Skip to content

Commit 8d9aa98

Browse files
committed
habanalabs: add support for f/w reset
When the f/w runs in secured mode, it can reset the ASIC when certain events occur. In unsecured mode, the driver asks the f/w to reset the ASIC for those events. In the secured case, we need to perform the entire reset procedure without accessing the ASIC, i.e. without halting the engines and without sending messages to the f/w. Signed-off-by: Oded Gabbay <[email protected]>
1 parent 56e753d commit 8d9aa98

File tree

5 files changed

+61
-35
lines changed

5 files changed

+61
-35
lines changed

drivers/misc/habanalabs/common/device.c

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
311311
container_of(work, struct hl_device_reset_work,
312312
reset_work.work);
313313
struct hl_device *hdev = device_reset_work->hdev;
314+
u32 flags;
314315
int rc;
315316

316-
rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
317+
flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
318+
319+
if (device_reset_work->fw_reset)
320+
flags |= HL_RESET_FW;
321+
322+
rc = hl_device_reset(hdev, flags);
317323
if ((rc == -EBUSY) && !hdev->device_fini_pending) {
318324
dev_info(hdev->dev,
319325
"Could not reset device. will try again in %u seconds",
@@ -702,7 +708,7 @@ static void take_release_locks(struct hl_device *hdev)
702708
mutex_unlock(&hdev->fpriv_list_lock);
703709
}
704710

705-
static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
711+
static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
706712
{
707713
if (hard_reset)
708714
device_late_fini(hdev);
@@ -712,7 +718,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
712718
* completions from H/W and we won't have any accesses from the
713719
* H/W to the host machine
714720
*/
715-
hdev->asic_funcs->halt_engines(hdev, hard_reset);
721+
hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
716722

717723
/* Go over all the queues, release all CS and their jobs */
718724
hl_cs_rollback_all(hdev);
@@ -922,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
922928
int hl_device_reset(struct hl_device *hdev, u32 flags)
923929
{
924930
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
925-
bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
931+
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
926932
int i, rc;
927933

928934
if (!hdev->init_done) {
@@ -933,6 +939,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
933939

934940
hard_reset = !!(flags & HL_RESET_HARD);
935941
from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
942+
fw_reset = !!(flags & HL_RESET_FW);
936943

937944
if (!hard_reset && !hdev->supports_soft_reset) {
938945
hard_instead_soft = true;
@@ -984,11 +991,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
984991
else
985992
hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
986993

987-
/*
988-
* if reset is due to heartbeat, device CPU is no responsive in
989-
* which case no point sending PCI disable message to it
994+
/* If reset is due to heartbeat, device CPU is not responsive in
995+
* which case no point sending PCI disable message to it.
996+
*
997+
* If F/W is performing the reset, no need to send it a message to disable
998+
* PCI access
990999
*/
991-
if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
1000+
if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
9921001
/* Disable PCI access from device F/W so it won't send
9931002
* us additional interrupts. We disable MSI/MSI-X at
9941003
* the halt_engines function and we can't have the F/W
@@ -1018,6 +1027,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
10181027

10191028
hdev->process_kill_trial_cnt = 0;
10201029

1030+
hdev->device_reset_work.fw_reset = fw_reset;
1031+
10211032
/*
10221033
* Because the reset function can't run from heartbeat work,
10231034
* we need to call the reset function from a dedicated work.
@@ -1028,7 +1039,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
10281039
return 0;
10291040
}
10301041

1031-
cleanup_resources(hdev, hard_reset);
1042+
cleanup_resources(hdev, hard_reset, fw_reset);
10321043

10331044
kill_processes:
10341045
if (hard_reset) {
@@ -1062,7 +1073,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
10621073
}
10631074

10641075
/* Reset the H/W. It will be in idle state after this returns */
1065-
hdev->asic_funcs->hw_fini(hdev, hard_reset);
1076+
hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
10661077

10671078
if (hard_reset) {
10681079
hdev->fw_loader.linux_loaded = false;
@@ -1587,7 +1598,7 @@ void hl_device_fini(struct hl_device *hdev)
15871598

15881599
hl_hwmon_fini(hdev);
15891600

1590-
cleanup_resources(hdev, true);
1601+
cleanup_resources(hdev, true, false);
15911602

15921603
/* Kill processes here after CS rollback. This is because the process
15931604
* can't really exit until all its CSs are done, which is what we
@@ -1606,7 +1617,7 @@ void hl_device_fini(struct hl_device *hdev)
16061617
hl_cb_pool_fini(hdev);
16071618

16081619
/* Reset the H/W. It will be in idle state after this returns */
1609-
hdev->asic_funcs->hw_fini(hdev, true);
1620+
hdev->asic_funcs->hw_fini(hdev, true, false);
16101621

16111622
hdev->fw_loader.linux_loaded = false;
16121623

drivers/misc/habanalabs/common/habanalabs.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,17 @@ enum hl_mmu_page_table_location {
128128
*
129129
* - HL_RESET_DEVICE_RELEASE
130130
* Set if reset is due to device release
131+
*
132+
* - HL_RESET_FW
133+
* F/W will perform the reset. No need to ask it to reset the device. This is relevant
134+
* only when running with secured f/w
131135
*/
132136
#define HL_RESET_HARD (1 << 0)
133137
#define HL_RESET_FROM_RESET_THREAD (1 << 1)
134138
#define HL_RESET_HEARTBEAT (1 << 2)
135139
#define HL_RESET_TDR (1 << 3)
136140
#define HL_RESET_DEVICE_RELEASE (1 << 4)
141+
#define HL_RESET_FW (1 << 5)
137142

138143
#define HL_MAX_SOBS_PER_MONITOR 8
139144

@@ -1170,8 +1175,8 @@ struct hl_asic_funcs {
11701175
int (*sw_init)(struct hl_device *hdev);
11711176
int (*sw_fini)(struct hl_device *hdev);
11721177
int (*hw_init)(struct hl_device *hdev);
1173-
void (*hw_fini)(struct hl_device *hdev, bool hard_reset);
1174-
void (*halt_engines)(struct hl_device *hdev, bool hard_reset);
1178+
void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
1179+
void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
11751180
int (*suspend)(struct hl_device *hdev);
11761181
int (*resume)(struct hl_device *hdev);
11771182
int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
@@ -2138,11 +2143,13 @@ struct hwmon_chip_info;
21382143
* @wq: work queue for device reset procedure.
21392144
* @reset_work: reset work to be done.
21402145
* @hdev: habanalabs device structure.
2146+
* @fw_reset: whether f/w will do the reset without us sending it a message to do it.
21412147
*/
21422148
struct hl_device_reset_work {
21432149
struct workqueue_struct *wq;
21442150
struct delayed_work reset_work;
21452151
struct hl_device *hdev;
2152+
bool fw_reset;
21462153
};
21472154

21482155
/**

drivers/misc/habanalabs/common/habanalabs_drv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
535535
result = PCI_ERS_RESULT_NONE;
536536
}
537537

538-
hdev->asic_funcs->halt_engines(hdev, true);
538+
hdev->asic_funcs->halt_engines(hdev, true, false);
539539

540540
return result;
541541
}

drivers/misc/habanalabs/gaudi/gaudi.c

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -833,14 +833,14 @@ static int gaudi_early_init(struct hl_device *hdev)
833833
GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
834834
if (rc) {
835835
if (hdev->reset_on_preboot_fail)
836-
hdev->asic_funcs->hw_fini(hdev, true);
836+
hdev->asic_funcs->hw_fini(hdev, true, false);
837837
goto pci_fini;
838838
}
839839

840840
if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
841841
dev_info(hdev->dev,
842842
"H/W state is dirty, must reset before initializing\n");
843-
hdev->asic_funcs->hw_fini(hdev, true);
843+
hdev->asic_funcs->hw_fini(hdev, true, false);
844844
}
845845

846846
return 0;
@@ -3836,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
38363836
WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
38373837
}
38383838

3839-
static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
3839+
static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
38403840
{
38413841
u32 wait_timeout_ms;
38423842

@@ -3848,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
38483848
else
38493849
wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
38503850

3851+
if (fw_reset)
3852+
goto skip_engines;
3853+
38513854
gaudi_stop_nic_qmans(hdev);
38523855
gaudi_stop_mme_qmans(hdev);
38533856
gaudi_stop_tpc_qmans(hdev);
@@ -3873,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
38733876

38743877
gaudi_disable_timestamp(hdev);
38753878

3879+
skip_engines:
38763880
gaudi_disable_msi(hdev);
38773881
}
38783882

@@ -4240,7 +4244,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
42404244
return rc;
42414245
}
42424246

4243-
static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
4247+
static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
42444248
{
42454249
struct cpu_dyn_regs *dyn_regs =
42464250
&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4261,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
42614265
cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
42624266
}
42634267

4268+
if (fw_reset) {
4269+
dev_info(hdev->dev,
4270+
"Firmware performs HARD reset, going to wait %dms\n",
4271+
reset_timeout_ms);
4272+
4273+
goto skip_reset;
4274+
}
4275+
42644276
driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled &&
42654277
!hdev->asic_prop.hard_reset_done_by_fw);
42664278

@@ -4337,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
43374349
reset_timeout_ms);
43384350
}
43394351

4352+
skip_reset:
43404353
/*
43414354
* After hard reset, we can't poll the BTM_FSM register because the PSOC
43424355
* itself is in reset. Need to wait until the reset is deasserted
@@ -7999,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
79998012
tpc_dec_event_to_tpc_id(event_type),
80008013
"AXI_SLV_DEC_Error");
80018014
if (reset_required) {
8002-
dev_err(hdev->dev, "hard reset required due to %s\n",
8015+
dev_err(hdev->dev, "reset required due to %s\n",
80038016
gaudi_irq_map_table[event_type].name);
80048017

8005-
goto reset_device;
8018+
hl_device_reset(hdev, 0);
80068019
} else {
80078020
hl_fw_unmask_irq(hdev, event_type);
80088021
}
@@ -8021,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
80218034
tpc_krn_event_to_tpc_id(event_type),
80228035
"KRN_ERR");
80238036
if (reset_required) {
8024-
dev_err(hdev->dev, "hard reset required due to %s\n",
8037+
dev_err(hdev->dev, "reset required due to %s\n",
80258038
gaudi_irq_map_table[event_type].name);
80268039

8027-
goto reset_device;
8040+
hl_device_reset(hdev, 0);
80288041
} else {
80298042
hl_fw_unmask_irq(hdev, event_type);
80308043
}
@@ -8154,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
81548167
return;
81558168

81568169
reset_device:
8157-
if (hdev->hard_reset_on_fw_events)
8170+
if (hdev->asic_prop.fw_security_enabled)
8171+
hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
8172+
else if (hdev->hard_reset_on_fw_events)
81588173
hl_device_reset(hdev, HL_RESET_HARD);
81598174
else
81608175
hl_fw_unmask_irq(hdev, event_type);

drivers/misc/habanalabs/goya/goya.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -654,14 +654,14 @@ static int goya_early_init(struct hl_device *hdev)
654654
GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
655655
if (rc) {
656656
if (hdev->reset_on_preboot_fail)
657-
hdev->asic_funcs->hw_fini(hdev, true);
657+
hdev->asic_funcs->hw_fini(hdev, true, false);
658658
goto pci_fini;
659659
}
660660

661661
if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
662662
dev_info(hdev->dev,
663663
"H/W state is dirty, must reset before initializing\n");
664-
hdev->asic_funcs->hw_fini(hdev, true);
664+
hdev->asic_funcs->hw_fini(hdev, true, false);
665665
}
666666

667667
if (!hdev->pldm) {
@@ -2380,7 +2380,7 @@ static void goya_disable_timestamp(struct hl_device *hdev)
23802380
WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
23812381
}
23822382

2383-
static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
2383+
static void goya_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
23842384
{
23852385
u32 wait_timeout_ms;
23862386

@@ -2703,14 +2703,7 @@ static int goya_hw_init(struct hl_device *hdev)
27032703
return rc;
27042704
}
27052705

2706-
/*
2707-
* goya_hw_fini - Goya hardware tear-down code
2708-
*
2709-
* @hdev: pointer to hl_device structure
2710-
* @hard_reset: should we do hard reset to all engines or just reset the
2711-
* compute/dma engines
2712-
*/
2713-
static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
2706+
static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
27142707
{
27152708
struct goya_device *goya = hdev->asic_specific;
27162709
u32 reset_timeout_ms, cpu_timeout_ms, status;

0 commit comments

Comments
 (0)