[nrf fromtree] Bluetooth: Host: Don't call user callback from TX thread

alwa-nordic · KyraLengfeld · commit c1cbb7879750 · 2025-12-08T09:44:15.000+01:00
ATT is invoking user callbacks in its net_buf destroy function. It is common practice that these callbacks can block on bt_hci_cmd_alloc(). This is a deadlock when the net_buf_unref() happens inside the HCI driver, invoked from tx_processor. Blocking callbacks like this appear in our own samples. See further down about how this problem was detected. tx_processor not protect against blocking callbacks so it is de-facto forbidden. The Host should not equip net_bufs with dangerous destroy callbacks. This commit makes ATT defer its net_buf destruction and user callback invocation to the system workqueue, so that net_buf_unref is safe to call from non-blocking threads. In the case of the deadlock, the net_buf_unref() was below the tx_processor in the call stack, which (at the time of this commit) is on the system work queue, so defering it to the system work queue is preserving the existing behavior. Future improvement may be to allow the user to provide their own workqueue for ATT callbacks. This deadlock was detected because the following test was failing while moving tx_processor to the bt_taskq: tests/bsim/bluetooth/ll/throughput/tests_scripts/gatt_write.sh The above test has an ATT callback `write_cmd_cb` invokes `bt_conn_le_param_update` can block waiting for `tx_processor`. The reason it was not failing while tx_processor was on the system work queue is that the GATT API has a special non-blocking behavior when called from the system work queue. Signed-off-by: Aleksander Wasaznik <aleksander.wasaznik@nordicsemi.no> (cherry picked from commit 6889042) Signed-off-by: Kyra Lengfeld <kyra.lengfeld@nordicsemi.no>
diff --git a/subsys/bluetooth/host/att.c b/subsys/bluetooth/host/att.c
@@ -25,6 +25,7 @@
 #include <zephyr/kernel/thread.h>
 #include <zephyr/logging/log.h>
 #include <zephyr/net_buf.h>
+#include <zephyr/spinlock.h>
 #include <zephyr/sys/__assert.h>
 #include <zephyr/sys/atomic.h>
 #include <zephyr/sys/byteorder.h>
@@ -248,10 +249,37 @@ const char *bt_att_err_to_str(uint8_t att_err)
 }
 #endif /* CONFIG_BT_ATT_ERR_TO_STR */
 
-static void att_tx_destroy(struct net_buf *buf)
+static void att_tx_destroy_work_handler(struct k_work *work);
+static K_WORK_DEFINE(att_tx_destroy_work, att_tx_destroy_work_handler);
+static struct k_spinlock tx_destroy_queue_lock;
+static sys_slist_t tx_destroy_queue = SYS_SLIST_STATIC_INIT(&tx_destroy_queue);
+
+static void att_tx_destroy_work_handler(struct k_work *work)
 {
-	struct bt_att_tx_meta_data *p_meta = att_get_tx_meta_data(buf);
+	struct net_buf *buf;
+	struct bt_att_tx_meta_data *p_meta;
 	struct bt_att_tx_meta_data meta;
+	sys_snode_t *buf_node;
+	k_spinlock_key_t key;
+	bool resubmit;
+
+	key = k_spin_lock(&tx_destroy_queue_lock);
+	buf_node = sys_slist_get(&tx_destroy_queue);
+	/* If there are more items in the queue, those likely have
+	 * been there before this handler started running and
+	 * coalesced into a single work submission, so we need to
+	 * resubmit.
+	 */
+	resubmit = !sys_slist_is_empty(&tx_destroy_queue);
+	k_spin_unlock(&tx_destroy_queue_lock, key);
+
+	/* Spurious wakeups can occur in with some thread interleavings. */
+	if (buf_node == NULL) {
+		return;
+	}
+
+	buf = CONTAINER_OF(buf_node, struct net_buf, node);
+	p_meta = att_get_tx_meta_data(buf);
 
 	LOG_DBG("%p", buf);
 
@@ -266,8 +294,8 @@ static void att_tx_destroy(struct net_buf *buf)
 	 */
 	memset(p_meta, 0x00, sizeof(*p_meta));
 
-	/* After this point, p_meta doesn't belong to us.
-	 * The user data will be memset to 0 on allocation.
+	/* After this point, p_meta doesn't belong to us. The user data will
+	 * be memset to 0 on allocation.
 	 */
 	net_buf_destroy(buf);
 
@@ -278,6 +306,51 @@ static void att_tx_destroy(struct net_buf *buf)
 	if (meta.opcode != 0) {
 		att_on_sent_cb(&meta);
 	}
+
+	/* We resubmit this work instead of looping to allow other work on
+	 * the work queue to run.
+	 */
+	if (resubmit) {
+		int err = k_work_submit_to_queue(NULL, work);
+
+		if (err < 0) {
+			LOG_ERR("Failed to re-submit %s: %d", __func__, err);
+			k_oops();
+		}
+	}
+}
+
+static void att_tx_destroy(struct net_buf *buf)
+{
+	int err;
+	k_spinlock_key_t key;
+
+	/* We need to invoke att_on_sent_cb, which may block. We
+	 * don't want to block in a net buf destroy callback, so we
+	 * defer to a sensible workqueue.
+	 *
+	 * bt_workq cannot be used because it currently forms a
+	 * deadlock with att_pool: bt_workq -> bt_att_recv ->
+	 * send_err_rsp waits for att pool.
+	 *
+	 * We use the system work queue to preserve earlier
+	 * behavior. The tx_processor used to run on the system work
+	 * queue, and it could end up here: tx_processor ->
+	 * bt_hci_send -> net_buf_unref.
+	 *
+	 * A possible alternative is tx_notify_workqueue_get() since
+	 * this workqueue is processing similar "completion" events.
+	 */
+	key = k_spin_lock(&tx_destroy_queue_lock);
+	sys_slist_append(&tx_destroy_queue, &buf->node);
+	k_spin_unlock(&tx_destroy_queue_lock, key);
+
+	err = k_work_submit(&att_tx_destroy_work);
+	if (err < 0) {
+		LOG_ERR("Failed to submit att_tx_destroy_work: %d", err);
+		k_oops();
+	}
+	/* Continues in att_tx_destroy_work_handler() */
 }
 
 NET_BUF_POOL_DEFINE(att_pool, CONFIG_BT_ATT_TX_COUNT,