Skip to content

Commit 485d65e

Browse files
Akiva Goldberger authored and kuba-moo committed
net/mlx5: Add a timeout to acquire the command queue semaphore
Prevent forced completion handling on an entry that has not yet been assigned an index, causing an out of bounds access on idx = -22.

Instead of waiting indefinitely for the sem, blocking flow now waits for index to be allocated or a sem acquisition timeout before beginning the timer for FW completion.

Kernel log example:
mlx5_core 0000:06:00.0: wait_func_handle_exec_timeout:1128:(pid 185911): cmd[-22]: CREATE_UCTX(0xa04) No done completion

Fixes: 8e715cd ("net/mlx5: Set command entry semaphore up once got index free")
Signed-off-by: Akiva Goldberger <[email protected]>
Reviewed-by: Moshe Shemesh <[email protected]>
Signed-off-by: Tariq Toukan <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 0f06228 commit 485d65e

File tree

2 files changed

+33
-9
lines changed
  • drivers/net/ethernet/mellanox/mlx5/core
  • include/linux/mlx5

2 files changed

+33
-9
lines changed

drivers/net/ethernet/mellanox/mlx5/core/cmd.c

Lines changed: 32 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -969,19 +969,32 @@ static void cmd_work_handler(struct work_struct *work)
969969
bool poll_cmd = ent->polling;
970970
struct mlx5_cmd_layout *lay;
971971
struct mlx5_core_dev *dev;
972-
unsigned long cb_timeout;
973-
struct semaphore *sem;
972+
unsigned long timeout;
974973
unsigned long flags;
975974
int alloc_ret;
976975
int cmd_mode;
977976

977+
complete(&ent->handling);
978+
978979
dev = container_of(cmd, struct mlx5_core_dev, cmd);
979-
cb_timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
980+
timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
980981

981-
complete(&ent->handling);
982-
sem = ent->page_queue ? &cmd->vars.pages_sem : &cmd->vars.sem;
983-
down(sem);
984982
if (!ent->page_queue) {
983+
if (down_timeout(&cmd->vars.sem, timeout)) {
984+
mlx5_core_warn(dev, "%s(0x%x) timed out while waiting for a slot.\n",
985+
mlx5_command_str(ent->op), ent->op);
986+
if (ent->callback) {
987+
ent->callback(-EBUSY, ent->context);
988+
mlx5_free_cmd_msg(dev, ent->out);
989+
free_msg(dev, ent->in);
990+
cmd_ent_put(ent);
991+
} else {
992+
ent->ret = -EBUSY;
993+
complete(&ent->done);
994+
}
995+
complete(&ent->slotted);
996+
return;
997+
}
985998
alloc_ret = cmd_alloc_index(cmd, ent);
986999
if (alloc_ret < 0) {
9871000
mlx5_core_err_rl(dev, "failed to allocate command entry\n");
@@ -994,17 +1007,20 @@ static void cmd_work_handler(struct work_struct *work)
9941007
ent->ret = -EAGAIN;
9951008
complete(&ent->done);
9961009
}
997-
up(sem);
1010+
up(&cmd->vars.sem);
9981011
return;
9991012
}
10001013
} else {
1014+
down(&cmd->vars.pages_sem);
10011015
ent->idx = cmd->vars.max_reg_cmds;
10021016
spin_lock_irqsave(&cmd->alloc_lock, flags);
10031017
clear_bit(ent->idx, &cmd->vars.bitmask);
10041018
cmd->ent_arr[ent->idx] = ent;
10051019
spin_unlock_irqrestore(&cmd->alloc_lock, flags);
10061020
}
10071021

1022+
complete(&ent->slotted);
1023+
10081024
lay = get_inst(cmd, ent->idx);
10091025
ent->lay = lay;
10101026
memset(lay, 0, sizeof(*lay));
@@ -1023,7 +1039,7 @@ static void cmd_work_handler(struct work_struct *work)
10231039
ent->ts1 = ktime_get_ns();
10241040
cmd_mode = cmd->mode;
10251041

1026-
if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout))
1042+
if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, timeout))
10271043
cmd_ent_get(ent);
10281044
set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
10291045

@@ -1143,6 +1159,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
11431159
ent->ret = -ECANCELED;
11441160
goto out_err;
11451161
}
1162+
1163+
wait_for_completion(&ent->slotted);
1164+
11461165
if (cmd->mode == CMD_MODE_POLLING || ent->polling)
11471166
wait_for_completion(&ent->done);
11481167
else if (!wait_for_completion_timeout(&ent->done, timeout))
@@ -1157,6 +1176,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
11571176
} else if (err == -ECANCELED) {
11581177
mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
11591178
mlx5_command_str(ent->op), ent->op);
1179+
} else if (err == -EBUSY) {
1180+
mlx5_core_warn(dev, "%s(0x%x) timeout while waiting for command semaphore.\n",
1181+
mlx5_command_str(ent->op), ent->op);
11601182
}
11611183
mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
11621184
err, deliv_status_to_str(ent->status), ent->status);
@@ -1208,6 +1230,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
12081230
ent->polling = force_polling;
12091231

12101232
init_completion(&ent->handling);
1233+
init_completion(&ent->slotted);
12111234
if (!callback)
12121235
init_completion(&ent->done);
12131236

@@ -1225,7 +1248,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
12251248
return 0; /* mlx5_cmd_comp_handler() will put(ent) */
12261249

12271250
err = wait_func(dev, ent);
1228-
if (err == -ETIMEDOUT || err == -ECANCELED)
1251+
if (err == -ETIMEDOUT || err == -ECANCELED || err == -EBUSY)
12291252
goto out_free;
12301253

12311254
ds = ent->ts2 - ent->ts1;

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -862,6 +862,7 @@ struct mlx5_cmd_work_ent {
862862
void *context;
863863
int idx;
864864
struct completion handling;
865+
struct completion slotted;
865866
struct completion done;
866867
struct mlx5_cmd *cmd;
867868
struct work_struct work;

0 commit comments

Comments
 (0)