Skip to content

Commit 838ac90

Browse files
committed
Merge tag 'drm-habanalabs-next-2023-04-10' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains additional habanalabs driver changes for v6.4: - uAPI changes: - Add a definition of a new Gaudi2 server type. This is used by userspace to know what is the connectivity between the accelerators inside the server - New features and improvements: - speedup h/w queues test in Gaudi2 to reduce device initialization times. - Firmware related fixes: - Fixes to the handshake protocol during f/w initialization. - Sync f/w events interrupt in hard reset to avoid warning message. - Improvements to extraction of the firmware version. - Misc bug fixes and code cleanups. Notable fixes are: - Multiple fixes for interrupt handling in Gaudi2. - Unmap mapped memory in case TLB invalidation fails. Signed-off-by: Daniel Vetter <[email protected]> From: Oded Gabbay <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 4d877b1 + 56499c4 commit 838ac90

File tree

17 files changed

+382
-269
lines changed

17 files changed

+382
-269
lines changed

drivers/accel/habanalabs/common/command_buffer.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,20 +45,29 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
4545
}
4646

4747
mutex_lock(&hdev->mmu_lock);
48+
4849
rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size);
4950
if (rc) {
5051
dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr);
51-
goto err_va_umap;
52+
goto err_va_pool_free;
5253
}
54+
5355
rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV);
56+
if (rc)
57+
goto err_mmu_unmap;
58+
5459
mutex_unlock(&hdev->mmu_lock);
5560

5661
cb->is_mmu_mapped = true;
57-
return rc;
5862

59-
err_va_umap:
63+
return 0;
64+
65+
err_mmu_unmap:
66+
hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size);
67+
err_va_pool_free:
6068
mutex_unlock(&hdev->mmu_lock);
6169
gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size);
70+
6271
return rc;
6372
}
6473

drivers/accel/habanalabs/common/decoder.c

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -43,48 +43,46 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
4343
intr_source[2], intr_source[3], intr_source[4], intr_source[5]);
4444
}
4545

46-
static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
46+
static void dec_abnrm_intr_work(struct work_struct *work)
4747
{
48+
struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work);
49+
struct hl_device *hdev = dec->hdev;
50+
u32 irq_status, event_mask = 0;
4851
bool reset_required = false;
49-
u32 irq_status, event_mask;
5052

51-
irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
53+
irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
5254

53-
dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id);
55+
dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id);
5456

5557
dec_print_abnrm_intr_source(hdev, irq_status);
5658

5759
/* Clear the interrupt */
58-
WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
60+
WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
5961

6062
/* Flush the interrupt clear */
61-
RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
63+
RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
6264

6365
if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
6466
reset_required = true;
65-
event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
66-
} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
67-
event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
68-
} else {
69-
event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
67+
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
7068
}
7169

70+
if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK)
71+
event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
72+
73+
if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK |
74+
VCMD_IRQ_STATUS_BUSERR_MASK |
75+
VCMD_IRQ_STATUS_ABORT_MASK))
76+
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
77+
7278
if (reset_required) {
7379
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
7480
hl_device_cond_reset(hdev, 0, event_mask);
75-
} else {
81+
} else if (event_mask) {
7682
hl_notifier_event_send_all(hdev, event_mask);
7783
}
7884
}
7985

80-
static void dec_completion_abnrm(struct work_struct *work)
81-
{
82-
struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work);
83-
struct hl_device *hdev = dec->hdev;
84-
85-
dec_error_intr_work(hdev, dec->base_addr, dec->core_id);
86-
}
87-
8886
void hl_dec_fini(struct hl_device *hdev)
8987
{
9088
kfree(hdev->dec);
@@ -108,7 +106,7 @@ int hl_dec_init(struct hl_device *hdev)
108106
dec = hdev->dec + j;
109107

110108
dec->hdev = hdev;
111-
INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm);
109+
INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work);
112110
dec->core_id = j;
113111
dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j);
114112
if (!dec->base_addr) {

drivers/accel/habanalabs/common/device.c

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev)
12711271
return 0;
12721272

12731273
disable_device:
1274-
pci_clear_master(hdev->pdev);
12751274
pci_disable_device(hdev->pdev);
12761275

12771276
return rc;
@@ -1381,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
13811380
mutex_unlock(fd_lock);
13821381
}
13831382

1383+
static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
1384+
{
1385+
/* If reset is due to heartbeat, device CPU is not responsive, in
1386+
* which case no point sending PCI disable message to it.
1387+
*/
1388+
if ((flags & HL_DRV_RESET_HARD) &&
1389+
!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
1390+
/* Disable PCI access from device F/W so it won't send
1391+
* us additional interrupts. We disable MSI/MSI-X at
1392+
* the halt_engines function and we can't have the F/W
1393+
* sending us interrupts after that. We need to disable
1394+
* the access here because if the device is marked
1395+
* disable, the message won't be send. Also, in case
1396+
* of heartbeat, the device CPU is marked as disable
1397+
* so this message won't be sent
1398+
*/
1399+
if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
1400+
dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
1401+
return;
1402+
}
1403+
1404+
/* verify that last EQs are handled before disabled is set */
1405+
if (hdev->cpu_queues_enable)
1406+
synchronize_irq(pci_irq_vector(hdev->pdev,
1407+
hdev->asic_prop.eq_interrupt_id));
1408+
}
1409+
}
1410+
13841411
static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
13851412
{
13861413
u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
@@ -1419,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
14191446
} else {
14201447
hdev->reset_info.reset_trigger_repeated = 1;
14211448
}
1422-
1423-
/* If reset is due to heartbeat, device CPU is no responsive in
1424-
* which case no point sending PCI disable message to it.
1425-
*
1426-
* If F/W is performing the reset, no need to send it a message to disable
1427-
* PCI access
1428-
*/
1429-
if ((flags & HL_DRV_RESET_HARD) &&
1430-
!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
1431-
/* Disable PCI access from device F/W so he won't send
1432-
* us additional interrupts. We disable MSI/MSI-X at
1433-
* the halt_engines function and we can't have the F/W
1434-
* sending us interrupts after that. We need to disable
1435-
* the access here because if the device is marked
1436-
* disable, the message won't be send. Also, in case
1437-
* of heartbeat, the device CPU is marked as disable
1438-
* so this message won't be sent
1439-
*/
1440-
if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
1441-
dev_warn(hdev->dev,
1442-
"Failed to disable FW's PCI access\n");
1443-
}
14441449
}
14451450

14461451
/*
@@ -1561,6 +1566,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
15611566

15621567
escalate_reset_flow:
15631568
handle_reset_trigger(hdev, flags);
1569+
send_disable_pci_access(hdev, flags);
15641570

15651571
/* This also blocks future CS/VM/JOB completion operations */
15661572
hdev->disabled = true;
@@ -1823,9 +1829,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
18231829
dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
18241830
flags = hdev->reset_info.hard_reset_schedule_flags;
18251831
hdev->reset_info.hard_reset_schedule_flags = 0;
1826-
hdev->disabled = true;
18271832
hard_reset = true;
1828-
handle_reset_trigger(hdev, flags);
18291833
goto escalate_reset_flow;
18301834
}
18311835
}

drivers/accel/habanalabs/common/firmware_if.c

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ static char *extract_fw_ver_from_str(const char *fw_str)
7171
return NULL;
7272
}
7373

74-
static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
74+
static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
7575
{
7676
char major[8], minor[8], *first_dot, *second_dot;
7777
int rc;
@@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
8686

8787
if (rc) {
8888
dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
89-
goto out;
89+
return rc;
9090
}
9191

9292
/* skip the first dot */
@@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
102102

103103
if (rc)
104104
dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
105-
106-
out:
107-
kfree(preboot_ver);
108105
return rc;
109106
}
110107

@@ -1263,7 +1260,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
12631260
COMMS_RST_DEV, 0, false,
12641261
hdev->fw_loader.cpu_timeout);
12651262
if (rc)
1266-
dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n");
1263+
dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n");
12671264
} else {
12681265
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV);
12691266
}
@@ -1281,10 +1278,10 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
12811278
/* Stop device CPU to make sure nothing bad happens */
12821279
if (hdev->asic_prop.dynamic_fw_load) {
12831280
rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
1284-
COMMS_GOTO_WFE, 0, true,
1281+
COMMS_GOTO_WFE, 0, false,
12851282
hdev->fw_loader.cpu_timeout);
12861283
if (rc)
1287-
dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
1284+
dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
12881285
} else {
12891286
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
12901287
msleep(static_loader->cpu_reset_wait_msec);
@@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
21812178

21822179
dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
21832180

2184-
/* This function takes care of freeing preboot_ver */
2185-
rc = extract_fw_sub_versions(hdev, preboot_ver);
2181+
rc = hl_get_preboot_major_minor(hdev, preboot_ver);
2182+
kfree(preboot_ver);
21862183
if (rc)
21872184
return rc;
21882185
}

drivers/accel/habanalabs/common/habanalabs.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ struct hl_hints_range {
662662
* @user_interrupt_count: number of user interrupts.
663663
* @user_dec_intr_count: number of decoder interrupts exposed to user.
664664
* @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
665-
* @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
665+
* @eq_interrupt_id: interrupt id for EQ, used to synchronize EQ interrupts in hard-reset.
666666
* @cache_line_size: device cache line size.
667667
* @server_type: Server type that the ASIC is currently installed in.
668668
* The value is according to enum hl_server_type in uapi file.
@@ -793,7 +793,7 @@ struct asic_fixed_properties {
793793
u16 user_interrupt_count;
794794
u16 user_dec_intr_count;
795795
u16 tpc_interrupt_id;
796-
u16 unexpected_user_error_interrupt_id;
796+
u16 eq_interrupt_id;
797797
u16 cache_line_size;
798798
u16 server_type;
799799
u8 completion_queues_count;
@@ -1211,15 +1211,15 @@ struct hl_eq {
12111211
/**
12121212
* struct hl_dec - describes a decoder sw instance.
12131213
* @hdev: pointer to the device structure.
1214-
* @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt
1214+
* @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt.
12151215
* @core_id: ID of the decoder.
12161216
* @base_addr: base address of the decoder.
12171217
*/
12181218
struct hl_dec {
1219-
struct hl_device *hdev;
1220-
struct work_struct completion_abnrm_work;
1221-
u32 core_id;
1222-
u32 base_addr;
1219+
struct hl_device *hdev;
1220+
struct work_struct abnrm_intr_work;
1221+
u32 core_id;
1222+
u32 base_addr;
12231223
};
12241224

12251225
/**

drivers/accel/habanalabs/common/irq.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,8 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
415415
struct hl_eq_entry *eq_base;
416416
struct hl_eqe_work *handle_eqe_work;
417417
bool entry_ready;
418-
u32 cur_eqe;
419-
u16 cur_eqe_index;
418+
u32 cur_eqe, ctl;
419+
u16 cur_eqe_index, event_type;
420420

421421
eq_base = eq->kernel_address;
422422

@@ -449,7 +449,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
449449
dma_rmb();
450450

451451
if (hdev->disabled && !hdev->reset_info.in_compute_reset) {
452-
dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
452+
ctl = le32_to_cpu(eq_entry->hdr.ctl);
453+
event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT);
454+
dev_warn(hdev->dev,
455+
"Device disabled but received an EQ event (%u)\n", event_type);
453456
goto skip_irq;
454457
}
455458

@@ -486,7 +489,7 @@ irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg)
486489
{
487490
struct hl_dec *dec = arg;
488491

489-
schedule_work(&dec->completion_abnrm_work);
492+
schedule_work(&dec->abnrm_intr_work);
490493

491494
return IRQ_HANDLED;
492495
}

drivers/accel/habanalabs/common/memory.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ static u64 get_va_block(struct hl_device *hdev,
605605
bool is_align_pow_2 = is_power_of_2(va_range->page_size);
606606
bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
607607
bool force_hint = flags & HL_MEM_FORCE_HINT;
608+
int rc;
608609

609610
if (is_align_pow_2)
610611
align_mask = ~((u64)va_block_align - 1);
@@ -722,9 +723,13 @@ static u64 get_va_block(struct hl_device *hdev,
722723
kfree(new_va_block);
723724
}
724725

725-
if (add_prev)
726-
add_va_block_locked(hdev, &va_range->list, prev_start,
727-
prev_end);
726+
if (add_prev) {
727+
rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end);
728+
if (rc) {
729+
reserved_valid_start = 0;
730+
goto out;
731+
}
732+
}
728733

729734
print_va_list_locked(hdev, &va_range->list);
730735
out:

drivers/accel/habanalabs/common/mmu/mmu.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,9 @@ int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags)
679679

680680
rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags);
681681
if (rc)
682-
dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n");
682+
dev_err_ratelimited(hdev->dev,
683+
"%s cache invalidation failed, rc=%d\n",
684+
flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc);
683685

684686
return rc;
685687
}
@@ -692,7 +694,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
692694
rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags,
693695
asid, va, size);
694696
if (rc)
695-
dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n");
697+
dev_err_ratelimited(hdev->dev,
698+
"%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d",
699+
flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc);
696700

697701
return rc;
698702
}

drivers/accel/habanalabs/common/pci/pci.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,6 @@ int hl_pci_init(struct hl_device *hdev)
420420
unmap_pci_bars:
421421
hl_pci_bars_unmap(hdev);
422422
disable_device:
423-
pci_clear_master(pdev);
424423
pci_disable_device(pdev);
425424

426425
return rc;
@@ -436,6 +435,5 @@ void hl_pci_fini(struct hl_device *hdev)
436435
{
437436
hl_pci_bars_unmap(hdev);
438437

439-
pci_clear_master(hdev->pdev);
440438
pci_disable_device(hdev->pdev);
441439
}

0 commit comments

Comments
 (0)