Skip to content

Commit ae27e88

Browse files
committed
Merge tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next
Oded writes: This tag contains habanalabs driver changes for v6.2: - New feature of graceful hard-reset. Instead of immediately killing the user-process when a command submission times out, we wait a bit and give the user-process notification and let it try to close things gracefully, with the ability to retrieve debug information. - Enhance the EventFD mechanism. Add new events such as access to illegal address (RAZWI), page fault, device unavailable. In addition, change the event workqueue to be handled in a single-threaded workqueue. - Allow the control device to work during reset of the ASIC, to enable monitoring applications to continue getting the data. - Add handling for Gaudi2 with PCI revision 2. - Reduce severity of prints due to power/thermal events. - Change how we use the h/w to perform memory scrubbing in Gaudi2. - Multiple bug fixes, refactors and renames. * tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (63 commits) habanalabs: fix VA range calculation habanalabs: fail driver load if EEPROM errors detected habanalabs: make print of engines idle mask more readable habanalabs: clear non-released encapsulated signals habanalabs: don't put context in hl_encaps_handle_do_release_sob() habanalabs: print context refcount value if hard reset fails habanalabs: add RMWREG32_SHIFTED to set a val within a mask habanalabs: fix rc when new CPUCP opcodes are not supported habanalabs/gaudi2: added memset for the cq_size register habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd() habanalabs: increase the size of busy engines mask habanalabs/gaudi2: change memory scrub mechanism habanalabs: extend process wait timeout in device fine habanalabs: check schedule_hard_reset correctly habanalabs: reset device if still in use when released habanalabs/gaudi2: return to reset upon SM SEI BRESP error habanalabs/gaudi2: don't enable entries in the MSIX_GW table habanalabs/gaudi2: remove redundant 
firmware version check habanalabs/gaudi: fix print for firmware-alive event habanalabs: fix print for out-of-sync and pkt-failure events ...
2 parents 449ef8f + 19a17a9 commit ae27e88

File tree

21 files changed

+1267
-529
lines changed

21 files changed

+1267
-529
lines changed

Documentation/ABI/testing/debugfs-driver-habanalabs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,13 @@ Description: Enables the root user to set the device to specific state.
9191
Valid values are "disable", "enable", "suspend", "resume".
9292
User can read this property to see the valid values
9393

94+
What: /sys/kernel/debug/habanalabs/hl<n>/device_release_watchdog_timeout
95+
Date: Oct 2022
96+
KernelVersion: 6.2
97+
98+
Description: The watchdog timeout value in seconds for a device release upon
99+
certain error cases, after which the device is reset.
100+
94101
What: /sys/kernel/debug/habanalabs/hl<n>/dma_size
95102
Date: Apr 2021
96103
KernelVersion: 5.13

drivers/misc/habanalabs/common/command_submission.c

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
742742
*/
743743
if (hl_cs_cmpl->encaps_signals)
744744
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
745-
hl_encaps_handle_do_release);
745+
hl_encaps_release_handle_and_put_ctx);
746746
}
747747

748-
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
749-
&& cs->encaps_signals)
750-
kref_put(&cs->encaps_sig_hdl->refcount,
751-
hl_encaps_handle_do_release);
748+
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
749+
kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
752750

753751
out:
754752
/* Must be called before hl_ctx_put because inside we use ctx to get
@@ -798,7 +796,7 @@ static void cs_do_release(struct kref *ref)
798796
static void cs_timedout(struct work_struct *work)
799797
{
800798
struct hl_device *hdev;
801-
u64 event_mask;
799+
u64 event_mask = 0x0;
802800
int rc;
803801
struct hl_cs *cs = container_of(work, struct hl_cs,
804802
work_tdr.work);
@@ -830,11 +828,7 @@ static void cs_timedout(struct work_struct *work)
830828
if (rc) {
831829
hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
832830
hdev->captured_err_info.cs_timeout.seq = cs->sequence;
833-
834-
event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
835-
HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
836-
837-
hl_notifier_event_send_all(hdev, event_mask);
831+
event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
838832
}
839833

840834
switch (cs->type) {
@@ -869,8 +863,12 @@ static void cs_timedout(struct work_struct *work)
869863

870864
cs_put(cs);
871865

872-
if (device_reset)
873-
hl_device_reset(hdev, HL_DRV_RESET_TDR);
866+
if (device_reset) {
867+
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
868+
hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
869+
} else if (event_mask) {
870+
hl_notifier_event_send_all(hdev, event_mask);
871+
}
874872
}
875873

876874
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
10111009
hl_complete_job(hdev, job);
10121010
}
10131011

1012+
/*
1013+
* release_reserved_encaps_signals() - release reserved encapsulated signals.
1014+
* @hdev: pointer to habanalabs device structure
1015+
*
1016+
* Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
1017+
* encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
1018+
* For these signals we also need to put the refcount of the H/W SOB which was taken at the
1019+
* reservation.
1020+
*/
1021+
static void release_reserved_encaps_signals(struct hl_device *hdev)
1022+
{
1023+
struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
1024+
struct hl_cs_encaps_sig_handle *handle;
1025+
struct hl_encaps_signals_mgr *mgr;
1026+
u32 id;
1027+
1028+
if (!ctx)
1029+
return;
1030+
1031+
mgr = &ctx->sig_mgr;
1032+
1033+
idr_for_each_entry(&mgr->handles, handle, id)
1034+
if (handle->cs_seq == ULLONG_MAX)
1035+
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
1036+
1037+
hl_ctx_put(ctx);
1038+
}
1039+
10141040
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
10151041
{
10161042
int i;
@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
10391065
}
10401066

10411067
force_complete_multi_cs(hdev);
1068+
1069+
release_reserved_encaps_signals(hdev);
10421070
}
10431071

10441072
static void
@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
20012029
*/
20022030
handle->pre_sob_val = prop->next_sob_val - handle->count;
20032031

2032+
handle->cs_seq = ULLONG_MAX;
2033+
20042034
*signals_count = prop->next_sob_val;
20052035
hdev->asic_funcs->hw_queues_unlock(hdev);
20062036

@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
23502380
/* We finished with the CS in this function, so put the ref */
23512381
cs_put(cs);
23522382
free_cs_chunk_array:
2353-
if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
2354-
is_wait_cs)
2355-
kref_put(&encaps_sig_hdl->refcount,
2356-
hl_encaps_handle_do_release);
2383+
if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
2384+
kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
23572385
kfree(cs_chunk_array);
23582386
out:
23592387
return rc;

drivers/misc/habanalabs/common/context.c

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,46 @@
99

1010
#include <linux/slab.h>
1111

12-
void hl_encaps_handle_do_release(struct kref *ref)
12+
static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
13+
bool put_ctx)
1314
{
14-
struct hl_cs_encaps_sig_handle *handle =
15-
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
1615
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
1716

17+
if (put_hw_sob)
18+
hw_sob_put(handle->hw_sob);
19+
1820
spin_lock(&mgr->lock);
1921
idr_remove(&mgr->handles, handle->id);
2022
spin_unlock(&mgr->lock);
2123

22-
hl_ctx_put(handle->ctx);
24+
if (put_ctx)
25+
hl_ctx_put(handle->ctx);
26+
2327
kfree(handle);
2428
}
2529

26-
static void hl_encaps_handle_do_release_sob(struct kref *ref)
30+
void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
2731
{
2832
struct hl_cs_encaps_sig_handle *handle =
29-
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
30-
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
33+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
3134

32-
/* if we're here, then there was a signals reservation but cs with
33-
* encaps signals wasn't submitted, so need to put refcount
34-
* to hw_sob taken at the reservation.
35-
*/
36-
hw_sob_put(handle->hw_sob);
35+
encaps_handle_do_release(handle, false, true);
36+
}
3737

38-
spin_lock(&mgr->lock);
39-
idr_remove(&mgr->handles, handle->id);
40-
spin_unlock(&mgr->lock);
38+
static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
39+
{
40+
struct hl_cs_encaps_sig_handle *handle =
41+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
4142

42-
hl_ctx_put(handle->ctx);
43-
kfree(handle);
43+
encaps_handle_do_release(handle, true, false);
44+
}
45+
46+
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
47+
{
48+
struct hl_cs_encaps_sig_handle *handle =
49+
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
50+
51+
encaps_handle_do_release(handle, true, true);
4452
}
4553

4654
static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
@@ -49,20 +57,22 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
4957
idr_init(&mgr->handles);
5058
}
5159

52-
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
53-
struct hl_encaps_signals_mgr *mgr)
60+
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
5461
{
5562
struct hl_cs_encaps_sig_handle *handle;
5663
struct idr *idp;
5764
u32 id;
5865

5966
idp = &mgr->handles;
6067

68+
/* The IDR is expected to be empty at this stage, because any remaining signal should have been
69+
* released as part of CS roll-back.
70+
*/
6171
if (!idr_is_empty(idp)) {
62-
dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
72+
dev_warn(hdev->dev,
73+
"device released while some encaps signals handles are still allocated\n");
6374
idr_for_each_entry(idp, handle, id)
64-
kref_put(&handle->refcount,
65-
hl_encaps_handle_do_release_sob);
75+
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
6676
}
6777

6878
idr_destroy(&mgr->handles);

drivers/misc/habanalabs/common/debugfs.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
17691769
dev_entry,
17701770
&hl_timeout_locked_fops);
17711771

1772+
debugfs_create_u32("device_release_watchdog_timeout",
1773+
0644,
1774+
dev_entry->root,
1775+
&hdev->device_release_watchdog_timeout_sec);
1776+
17721777
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
17731778
debugfs_create_file(hl_debugfs_list[i].name,
17741779
0444,

0 commit comments

Comments
 (0)