Skip to content

Commit 893afb2

Browse files
Tomer Tayar authored and ogabbay committed
habanalabs: clear non-released encapsulated signals
Reserved encapsulated signals which were not released hold the context refcount, leading to a failure when killing the user process on device reset or device fini. Add the release of these leftover signals to the CS roll-back process.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
1 parent 1f61512 commit 893afb2

File tree

3 files changed

+71
-31
lines changed

3 files changed

+71
-31
lines changed

drivers/misc/habanalabs/common/command_submission.c

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
742742
*/
743743
if (hl_cs_cmpl->encaps_signals)
744744
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
745-
hl_encaps_handle_do_release);
745+
hl_encaps_release_handle_and_put_ctx);
746746
}
747747

748-
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
749-
&& cs->encaps_signals)
750-
kref_put(&cs->encaps_sig_hdl->refcount,
751-
hl_encaps_handle_do_release);
748+
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
749+
kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
752750

753751
out:
754752
/* Must be called before hl_ctx_put because inside we use ctx to get
@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
10111009
hl_complete_job(hdev, job);
10121010
}
10131011

1012+
/*
1013+
* release_reserved_encaps_signals() - release reserved encapsulated signals.
1014+
* @hdev: pointer to habanalabs device structure
1015+
*
1016+
* Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
1017+
* encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
1018+
* For these signals need also to put the refcount of the H/W SOB which was taken at the
1019+
* reservation.
1020+
*/
1021+
static void release_reserved_encaps_signals(struct hl_device *hdev)
1022+
{
1023+
struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
1024+
struct hl_cs_encaps_sig_handle *handle;
1025+
struct hl_encaps_signals_mgr *mgr;
1026+
u32 id;
1027+
1028+
if (!ctx)
1029+
return;
1030+
1031+
mgr = &ctx->sig_mgr;
1032+
1033+
idr_for_each_entry(&mgr->handles, handle, id)
1034+
if (handle->cs_seq == ULLONG_MAX)
1035+
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
1036+
1037+
hl_ctx_put(ctx);
1038+
}
1039+
10141040
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
10151041
{
10161042
int i;
@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
10391065
}
10401066

10411067
force_complete_multi_cs(hdev);
1068+
1069+
release_reserved_encaps_signals(hdev);
10421070
}
10431071

10441072
static void
@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
20012029
*/
20022030
handle->pre_sob_val = prop->next_sob_val - handle->count;
20032031

2032+
handle->cs_seq = ULLONG_MAX;
2033+
20042034
*signals_count = prop->next_sob_val;
20052035
hdev->asic_funcs->hw_queues_unlock(hdev);
20062036

@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
23502380
/* We finished with the CS in this function, so put the ref */
23512381
cs_put(cs);
23522382
free_cs_chunk_array:
2353-
if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
2354-
is_wait_cs)
2355-
kref_put(&encaps_sig_hdl->refcount,
2356-
hl_encaps_handle_do_release);
2383+
if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
2384+
kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
23572385
kfree(cs_chunk_array);
23582386
out:
23592387
return rc;

drivers/misc/habanalabs/common/context.c

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,37 +9,46 @@
99

1010
#include <linux/slab.h>
1111

12-
void hl_encaps_handle_do_release(struct kref *ref)
12+
static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
13+
bool put_ctx)
1314
{
14-
struct hl_cs_encaps_sig_handle *handle =
15-
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
1615
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
1716

17+
if (put_hw_sob)
18+
hw_sob_put(handle->hw_sob);
19+
1820
spin_lock(&mgr->lock);
1921
idr_remove(&mgr->handles, handle->id);
2022
spin_unlock(&mgr->lock);
2123

22-
hl_ctx_put(handle->ctx);
24+
if (put_ctx)
25+
hl_ctx_put(handle->ctx);
26+
2327
kfree(handle);
2428
}
2529

26-
static void hl_encaps_handle_do_release_sob(struct kref *ref)
30+
void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
2731
{
2832
struct hl_cs_encaps_sig_handle *handle =
29-
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
30-
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
33+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
3134

32-
/* if we're here, then there was a signals reservation but cs with
33-
* encaps signals wasn't submitted, so need to put refcount
34-
* to hw_sob taken at the reservation.
35-
*/
36-
hw_sob_put(handle->hw_sob);
35+
encaps_handle_do_release(handle, false, true);
36+
}
3737

38-
spin_lock(&mgr->lock);
39-
idr_remove(&mgr->handles, handle->id);
40-
spin_unlock(&mgr->lock);
38+
static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
39+
{
40+
struct hl_cs_encaps_sig_handle *handle =
41+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
4142

42-
kfree(handle);
43+
encaps_handle_do_release(handle, true, false);
44+
}
45+
46+
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
47+
{
48+
struct hl_cs_encaps_sig_handle *handle =
49+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
50+
51+
encaps_handle_do_release(handle, true, true);
4352
}
4453

4554
static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
@@ -48,20 +57,22 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
4857
idr_init(&mgr->handles);
4958
}
5059

51-
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
52-
struct hl_encaps_signals_mgr *mgr)
60+
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
5361
{
5462
struct hl_cs_encaps_sig_handle *handle;
5563
struct idr *idp;
5664
u32 id;
5765

5866
idp = &mgr->handles;
5967

68+
/* The IDR is expected to be empty at this stage, because any left signal should have been
69+
* released as part of CS roll-back.
70+
*/
6071
if (!idr_is_empty(idp)) {
61-
dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
72+
dev_warn(hdev->dev,
73+
"device released while some encaps signals handles are still allocated\n");
6274
idr_for_each_entry(idp, handle, id)
63-
kref_put(&handle->refcount,
64-
hl_encaps_handle_do_release_sob);
75+
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
6576
}
6677

6778
idr_destroy(&mgr->handles);

drivers/misc/habanalabs/common/habanalabs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3775,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d
37753775

37763776
void hw_sob_get(struct hl_hw_sob *hw_sob);
37773777
void hw_sob_put(struct hl_hw_sob *hw_sob);
3778-
void hl_encaps_handle_do_release(struct kref *ref);
3778+
void hl_encaps_release_handle_and_put_ctx(struct kref *ref);
3779+
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref);
37793780
void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
37803781
struct hl_cs *cs, struct hl_cs_job *job,
37813782
struct hl_cs_compl *cs_cmpl);

0 commit comments

Comments
 (0)