Skip to content

Commit 4882ba7

Browse files
committed
afs: Fix afs_server ref accounting
The current way that afs_server refs are accounted and cleaned up sometimes cause rmmod to hang when it is waiting for cell records to be removed. The problem is that the cell cleanup might occasionally happen before the server cleanup and then there's nothing that causes the cell to garbage-collect the remaining servers as they become inactive. Partially fix this by: (1) Give each afs_server record its own management timer that rather than relying on the cell manager's central timer to drive each individual cell's maintenance work item to garbage collect servers. This timer is set when afs_unuse_server() reduces a server's activity count to zero and will schedule the server's destroyer work item upon firing. (2) Give each afs_server record its own destroyer work item that removes the record from the cell's database, shuts down the timer, cancels any pending work for itself, sends an RPC to the server to cancel outstanding callbacks. This change, in combination with the timer, obviates the need to try and coordinate so closely between the cell record and a bunch of other server records to try and tear everything down in a coordinated fashion. With this, the cell record is pinned until the server RCU is complete and namespace/module removal will wait until all the cell records are removed. (3) Now that incoming calls are mapped to servers (and thus cells) using data attached to an rxrpc_peer, the UUID-to-server mapping tree is moved from the namespace to the cell (cell->fs_servers). This means there can no longer be duplicates therein - and that allows the mapping tree to be simpler as there doesn't need to be a chain of same-UUID servers that are in different cells. (4) The lock protecting the UUID mapping tree is switched to an rw_semaphore on the cell rather than a seqlock on the namespace as it's now only used during mounting in contexts in which we're allowed to sleep. (5) When it comes time for a cell that is being removed to purge its set of servers, it just needs to iterate over them and wake them up. Once a server becomes inactive, its destroyer work item will observe the state of the cell and immediately remove that record. (6) When a server record is removed, it is marked AFS_SERVER_FL_EXPIRED to prevent reattempts at removal. The record will be dispatched to RCU for destruction once its refcount reaches 0. (7) The AFS_SERVER_FL_UNCREATED/CREATING flags are used to synchronise simultaneous creation attempts. If one attempt fails, it will abandon the attempt and allow another to try again. Note that the record can't just be abandoned when dead as it's bound into a server list attached to a volume and only subject to replacement if the server list obtained for the volume from the VLDB changes. Signed-off-by: David Howells <[email protected]> cc: Marc Dionne <[email protected]> cc: [email protected] cc: [email protected] Link: https://lore.kernel.org/r/[email protected]/ # v1 Link: https://lore.kernel.org/r/[email protected]/ # v4
1 parent 40e8b52 commit 4882ba7

File tree

7 files changed

+289
-357
lines changed

7 files changed

+289
-357
lines changed

fs/afs/cell.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
169169
INIT_HLIST_HEAD(&cell->proc_volumes);
170170
seqlock_init(&cell->volume_lock);
171171
cell->fs_servers = RB_ROOT;
172-
seqlock_init(&cell->fs_lock);
172+
init_rwsem(&cell->fs_lock);
173173
rwlock_init(&cell->vl_servers_lock);
174174
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
175175

@@ -838,6 +838,7 @@ static void afs_manage_cell(struct afs_cell *cell)
838838
/* The root volume is pinning the cell */
839839
afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
840840
cell->root_volume = NULL;
841+
afs_purge_servers(cell);
841842
afs_put_cell(cell, afs_cell_trace_put_destroy);
842843
}
843844

fs/afs/fsclient.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
16531653
bp = call->request;
16541654
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
16551655

1656-
call->server = afs_use_server(server, afs_server_trace_use_give_up_cb);
1656+
call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb);
16571657
afs_make_call(call, GFP_NOFS);
16581658
afs_wait_for_call_to_complete(call);
16591659
ret = call->error;
@@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
17601760
return false;
17611761

17621762
call->key = key;
1763-
call->server = afs_use_server(server, afs_server_trace_use_get_caps);
1763+
call->server = afs_use_server(server, false, afs_server_trace_use_get_caps);
17641764
call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
17651765
call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
17661766
call->probe_index = addr_index;

fs/afs/internal.h

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -302,18 +302,11 @@ struct afs_net {
302302
* cell, but in practice, people create aliases and subsets and there's
303303
* no easy way to distinguish them.
304304
*/
305-
seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */
306-
struct rb_root fs_servers; /* afs_server (by server UUID or address) */
305+
seqlock_t fs_lock; /* For fs_probe_*, fs_proc */
307306
struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */
308307
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
309308
struct hlist_head fs_proc; /* procfs servers list */
310309

311-
struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
312-
seqlock_t fs_addr_lock; /* For fs_addresses[46] */
313-
314-
struct work_struct fs_manager;
315-
struct timer_list fs_timer;
316-
317310
struct work_struct fs_prober;
318311
struct timer_list fs_probe_timer;
319312
atomic_t servers_outstanding;
@@ -409,7 +402,7 @@ struct afs_cell {
409402

410403
/* Active fileserver interaction state. */
411404
struct rb_root fs_servers; /* afs_server (by server UUID) */
412-
seqlock_t fs_lock; /* For fs_servers */
405+
struct rw_semaphore fs_lock; /* For fs_servers */
413406

414407
/* VL server list. */
415408
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -544,22 +537,22 @@ struct afs_server {
544537
};
545538

546539
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
547-
struct rb_node uuid_rb; /* Link in net->fs_servers */
548-
struct afs_server __rcu *uuid_next; /* Next server with same UUID */
549-
struct afs_server *uuid_prev; /* Previous server with same UUID */
550-
struct list_head probe_link; /* Link in net->fs_probe_list */
551-
struct hlist_node addr_link; /* Link in net->fs_addresses6 */
540+
struct rb_node uuid_rb; /* Link in cell->fs_servers */
541+
struct list_head probe_link; /* Link in net->fs_probe_* */
552542
struct hlist_node proc_link; /* Link in net->fs_proc */
553543
struct list_head volumes; /* RCU list of afs_server_entry objects */
554-
struct afs_server *gc_next; /* Next server in manager's list */
544+
struct work_struct destroyer; /* Work item to try and destroy a server */
545+
struct timer_list timer; /* Management timer */
555546
time64_t unuse_time; /* Time at which last unused */
556547
unsigned long flags;
557548
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
558549
#define AFS_SERVER_FL_UPDATING 1
559550
#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */
560-
#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */
561-
#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */
562-
#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */
551+
#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */
552+
#define AFS_SERVER_FL_CREATING 4 /* The record is being created */
553+
#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */
554+
#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */
555+
#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */
563556
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
564557
#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */
565558
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
@@ -569,6 +562,7 @@ struct afs_server {
569562
atomic_t active; /* Active user count */
570563
u32 addr_version; /* Address list version */
571564
u16 service_id; /* Service ID we're using. */
565+
short create_error; /* Creation error */
572566
unsigned int rtt; /* Server's current RTT in uS */
573567
unsigned int debug_id; /* Debugging ID for traces */
574568

@@ -1513,19 +1507,29 @@ extern void __exit afs_clean_up_permit_cache(void);
15131507
extern spinlock_t afs_server_peer_lock;
15141508

15151509
struct afs_server *afs_find_server(const struct rxrpc_peer *peer);
1516-
extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
15171510
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
15181511
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
1519-
extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace);
1520-
extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
1521-
extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace);
1512+
struct afs_server *afs_use_server(struct afs_server *server, bool activate,
1513+
enum afs_server_trace reason);
1514+
void afs_unuse_server(struct afs_net *net, struct afs_server *server,
1515+
enum afs_server_trace reason);
1516+
void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
1517+
enum afs_server_trace reason);
15221518
extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
1523-
extern void afs_manage_servers(struct work_struct *);
1524-
extern void afs_servers_timer(struct timer_list *);
1519+
void afs_purge_servers(struct afs_cell *cell);
15251520
extern void afs_fs_probe_timer(struct timer_list *);
1526-
extern void __net_exit afs_purge_servers(struct afs_net *);
1521+
void __net_exit afs_wait_for_servers(struct afs_net *net);
15271522
bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
15281523

1524+
static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace)
1525+
{
1526+
int r = refcount_read(&server->ref);
1527+
int a = atomic_read(&server->active);
1528+
1529+
trace_afs_server(server->debug_id, r, a, trace);
1530+
1531+
}
1532+
15291533
static inline void afs_inc_servers_outstanding(struct afs_net *net)
15301534
{
15311535
atomic_inc(&net->servers_outstanding);

fs/afs/main.c

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,16 +86,10 @@ static int __net_init afs_net_init(struct net *net_ns)
8686
INIT_HLIST_HEAD(&net->proc_cells);
8787

8888
seqlock_init(&net->fs_lock);
89-
net->fs_servers = RB_ROOT;
9089
INIT_LIST_HEAD(&net->fs_probe_fast);
9190
INIT_LIST_HEAD(&net->fs_probe_slow);
9291
INIT_HLIST_HEAD(&net->fs_proc);
9392

94-
INIT_HLIST_HEAD(&net->fs_addresses);
95-
seqlock_init(&net->fs_addr_lock);
96-
97-
INIT_WORK(&net->fs_manager, afs_manage_servers);
98-
timer_setup(&net->fs_timer, afs_servers_timer, 0);
9993
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
10094
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
10195
atomic_set(&net->servers_outstanding, 1);
@@ -131,7 +125,7 @@ static int __net_init afs_net_init(struct net *net_ns)
131125
net->live = false;
132126
afs_fs_probe_cleanup(net);
133127
afs_cell_purge(net);
134-
afs_purge_servers(net);
128+
afs_wait_for_servers(net);
135129
error_cell_init:
136130
net->live = false;
137131
afs_proc_cleanup(net);
@@ -153,7 +147,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
153147
net->live = false;
154148
afs_fs_probe_cleanup(net);
155149
afs_cell_purge(net);
156-
afs_purge_servers(net);
150+
afs_wait_for_servers(net);
157151
afs_close_socket(net);
158152
afs_proc_cleanup(net);
159153
afs_put_sysnames(net->sysnames);

0 commit comments

Comments
 (0)