Skip to content

Commit adda800

Browse files
committed
accel/habanalabs: print max timeout value on CS stuck
If a workload gets stuck, the driver prints an error about it to the kernel log. Add the configured max timeout value to that message, because the value is not fixed across ASICs and can also be configured through a kernel module parameter. Signed-off-by: Oded Gabbay <[email protected]> Reviewed-by: Ofir Bitton <[email protected]>
1 parent dcfce96 commit adda800

File tree

1 file changed

+15
-11
lines changed

1 file changed

+15
-11
lines changed

drivers/accel/habanalabs/common/command_submission.c

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
804804

805805
static void cs_timedout(struct work_struct *work)
806806
{
807+
struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
808+
bool skip_reset_on_timeout, device_reset = false;
807809
struct hl_device *hdev;
808810
u64 event_mask = 0x0;
811+
uint timeout_sec;
809812
int rc;
810-
struct hl_cs *cs = container_of(work, struct hl_cs,
811-
work_tdr.work);
812-
bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
813+
814+
skip_reset_on_timeout = cs->skip_reset_on_timeout;
813815

814816
rc = cs_get_unless_zero(cs);
815817
if (!rc)
@@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
840842
event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
841843
}
842844

845+
timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
846+
843847
switch (cs->type) {
844848
case CS_TYPE_SIGNAL:
845849
dev_err(hdev->dev,
846-
"Signal command submission %llu has not finished in time!\n",
847-
cs->sequence);
850+
"Signal command submission %llu has not finished in %u seconds!\n",
851+
cs->sequence, timeout_sec);
848852
break;
849853

850854
case CS_TYPE_WAIT:
851855
dev_err(hdev->dev,
852-
"Wait command submission %llu has not finished in time!\n",
853-
cs->sequence);
856+
"Wait command submission %llu has not finished in %u seconds!\n",
857+
cs->sequence, timeout_sec);
854858
break;
855859

856860
case CS_TYPE_COLLECTIVE_WAIT:
857861
dev_err(hdev->dev,
858-
"Collective Wait command submission %llu has not finished in time!\n",
859-
cs->sequence);
862+
"Collective Wait command submission %llu has not finished in %u seconds!\n",
863+
cs->sequence, timeout_sec);
860864
break;
861865

862866
default:
863867
dev_err(hdev->dev,
864-
"Command submission %llu has not finished in time!\n",
865-
cs->sequence);
868+
"Command submission %llu has not finished in %u seconds!\n",
869+
cs->sequence, timeout_sec);
866870
break;
867871
}
868872

0 commit comments

Comments
 (0)