Skip to content

Commit adcc0ae

Browse files
chucklever authored and Trond Myklebust committed
NFS: Use NFSv4.2's OFFLOAD_STATUS operation
We've found that there are cases where a transport disconnection results in the loss of callback RPCs. NFS servers typically do not retransmit callback operations after a disconnect. This can be a problem for the Linux NFS client's current implementation of asynchronous COPY, which waits indefinitely for a CB_OFFLOAD callback. If a transport disconnect occurs while an async COPY is running, there's a good chance the client will never get the completing CB_OFFLOAD. Fix this by implementing the OFFLOAD_STATUS operation so that the Linux NFS client can probe the NFS server if it doesn't see a CB_OFFLOAD in a reasonable amount of time. This patch implements a simplistic check. As future work, the client might also be able to detect whether there is no forward progress on the requested asynchronous COPY operation, and CANCEL it. Suggested-by: Olga Kornievskaia <[email protected]> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218735 Reviewed-by: Jeff Layton <[email protected]> Signed-off-by: Chuck Lever <[email protected]> Reviewed-by: Benjamin Coddington <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Trond Myklebust <[email protected]>
1 parent 77dd8a3 commit adcc0ae

File tree

1 file changed

+59
-11
lines changed

1 file changed

+59
-11
lines changed

fs/nfs/nfs42proc.c

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
175175
return err;
176176
}
177177

178+
/*
 * nfs4_copy_dequeue_callback - unlink an async COPY state from its lists
 * @dst_server: destination server of the copy
 * @src_server: source server of the copy
 * @copy: copy state to unlink
 *
 * Removes @copy->copies while holding the destination client's cl_lock.
 * When the source and destination servers differ, also removes
 * @copy->src_copies while holding the source client's cl_lock.
 * list_del_init() re-initializes each list entry after unlinking it.
 */
static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server,
179+
struct nfs_server *src_server,
180+
struct nfs4_copy_state *copy)
181+
{
182+
spin_lock(&dst_server->nfs_client->cl_lock);
183+
list_del_init(&copy->copies);
184+
spin_unlock(&dst_server->nfs_client->cl_lock);
185+
if (dst_server != src_server) {
186+
spin_lock(&src_server->nfs_client->cl_lock);
187+
list_del_init(&copy->src_copies);
188+
spin_unlock(&src_server->nfs_client->cl_lock);
189+
}
190+
}
191+
178192
static int handle_async_copy(struct nfs42_copy_res *res,
179193
struct nfs_server *dst_server,
180194
struct nfs_server *src_server,
@@ -184,9 +198,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
184198
bool *restart)
185199
{
186200
struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
187-
int status = NFS4_OK;
188201
struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
189202
struct nfs_open_context *src_ctx = nfs_file_open_context(src);
203+
struct nfs_client *clp = dst_server->nfs_client;
204+
unsigned long timeout = 3 * HZ;
205+
int status = NFS4_OK;
206+
u64 copied;
190207

191208
copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
192209
if (!copy)
@@ -224,15 +241,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
224241
spin_unlock(&src_server->nfs_client->cl_lock);
225242
}
226243

227-
status = wait_for_completion_interruptible(&copy->completion);
228-
spin_lock(&dst_server->nfs_client->cl_lock);
229-
list_del_init(&copy->copies);
230-
spin_unlock(&dst_server->nfs_client->cl_lock);
231-
if (dst_server != src_server) {
232-
spin_lock(&src_server->nfs_client->cl_lock);
233-
list_del_init(&copy->src_copies);
234-
spin_unlock(&src_server->nfs_client->cl_lock);
235-
}
244+
wait:
245+
status = wait_for_completion_interruptible_timeout(&copy->completion,
246+
timeout);
247+
if (!status)
248+
goto timeout;
249+
nfs4_copy_dequeue_callback(dst_server, src_server, copy);
236250
if (status == -ERESTARTSYS) {
237251
goto out_cancel;
238252
} else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
@@ -242,6 +256,7 @@ static int handle_async_copy(struct nfs42_copy_res *res,
242256
}
243257
out:
244258
res->write_res.count = copy->count;
259+
/* Copy out the updated write verifier provided by CB_OFFLOAD. */
245260
memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
246261
status = -copy->error;
247262

@@ -253,6 +268,39 @@ static int handle_async_copy(struct nfs42_copy_res *res,
253268
if (!nfs42_files_from_same_server(src, dst))
254269
nfs42_do_offload_cancel_async(src, src_stateid);
255270
goto out_free;
271+
timeout:
272+
timeout <<= 1;
273+
if (timeout > (clp->cl_lease_time >> 1))
274+
timeout = clp->cl_lease_time >> 1;
275+
status = nfs42_proc_offload_status(dst, &copy->stateid, &copied);
276+
if (status == -EINPROGRESS)
277+
goto wait;
278+
nfs4_copy_dequeue_callback(dst_server, src_server, copy);
279+
switch (status) {
280+
case 0:
281+
/* The server recognized the copy stateid, so it hasn't
282+
* rebooted. Don't overwrite the verifier returned in the
283+
* COPY result. */
284+
res->write_res.count = copied;
285+
goto out_free;
286+
case -EREMOTEIO:
287+
/* COPY operation failed on the server. */
288+
status = -EOPNOTSUPP;
289+
res->write_res.count = copied;
290+
goto out_free;
291+
case -EBADF:
292+
/* Server did not recognize the copy stateid. It has
293+
* probably restarted and lost the plot. */
294+
res->write_res.count = 0;
295+
status = -EOPNOTSUPP;
296+
break;
297+
case -EOPNOTSUPP:
298+
/* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when
299+
* it has signed up for an async COPY, so server is not
300+
* spec-compliant. */
301+
res->write_res.count = 0;
302+
}
303+
goto out_free;
256304
}
257305

258306
static int process_copy_commit(struct file *dst, loff_t pos_dst,
@@ -643,7 +691,7 @@ _nfs42_proc_offload_status(struct nfs_server *server, struct file *file,
643691
* Other negative errnos indicate the client could not complete the
644692
* request.
645693
*/
646-
static int __maybe_unused
694+
static int
647695
nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied)
648696
{
649697
struct inode *inode = file_inode(dst);

0 commit comments

Comments
 (0)