Skip to content

Commit b2d61fe

Browse files
KobyElbaz authored and ogabbay committed
accel/habanalabs: upon DMA errors, use FW-extracted error cause
Initially, the driver read the error cause data directly from the ASIC. However, the FW now clears it before the driver can read it. Therefore, we should use the error cause data that is extracted by the FW.

Signed-off-by: Koby Elbaz <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
1 parent adda800 commit b2d61fe

File tree

1 file changed

+8
-29
lines changed

1 file changed

+8
-29
lines changed

drivers/accel/habanalabs/gaudi2/gaudi2.c

Lines changed: 8 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -8807,13 +8807,13 @@ static int gaudi2_handle_kdma_core_event(struct hl_device *hdev, u16 event_type,
88078807
return error_count;
88088808
}
88098809

8810-
static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, int sts_addr)
8810+
static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type, u64 intr_cause)
88118811
{
8812-
u32 error_count = 0, sts_val = RREG32(sts_addr);
8812+
u32 error_count = 0;
88138813
int i;
88148814

88158815
for (i = 0 ; i < GAUDI2_NUM_OF_DMA_CORE_INTR_CAUSE ; i++)
8816-
if (sts_val & BIT(i)) {
8816+
if (intr_cause & BIT(i)) {
88178817
gaudi2_print_event(hdev, event_type, true,
88188818
"err cause: %s", gaudi2_dma_core_interrupts_cause[i]);
88198819
error_count++;
@@ -8824,27 +8824,6 @@ static int gaudi2_handle_dma_core_event(struct hl_device *hdev, u16 event_type,
88248824
return error_count;
88258825
}
88268826

8827-
static int gaudi2_handle_pdma_core_event(struct hl_device *hdev, u16 event_type, int pdma_idx)
8828-
{
8829-
u32 sts_addr;
8830-
8831-
sts_addr = mmPDMA0_CORE_ERR_CAUSE + pdma_idx * PDMA_OFFSET;
8832-
return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
8833-
}
8834-
8835-
static int gaudi2_handle_edma_core_event(struct hl_device *hdev, u16 event_type, int edma_idx)
8836-
{
8837-
static const int edma_event_index_map[] = {2, 3, 0, 1, 6, 7, 4, 5};
8838-
u32 sts_addr, index;
8839-
8840-
index = edma_event_index_map[edma_idx];
8841-
8842-
sts_addr = mmDCORE0_EDMA0_CORE_ERR_CAUSE +
8843-
DCORE_OFFSET * (index / NUM_OF_EDMA_PER_DCORE) +
8844-
DCORE_EDMA_OFFSET * (index % NUM_OF_EDMA_PER_DCORE);
8845-
return gaudi2_handle_dma_core_event(hdev, event_type, sts_addr);
8846-
}
8847-
88488827
static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask)
88498828
{
88508829
u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr;
@@ -9725,19 +9704,19 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
97259704
case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
97269705
case GAUDI2_EVENT_KDMA0_CORE:
97279706
error_count = gaudi2_handle_kdma_core_event(hdev, event_type,
9728-
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
9707+
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
97299708
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
97309709
break;
97319710

97329711
case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_HDMA5_CORE:
9733-
index = event_type - GAUDI2_EVENT_HDMA2_CORE;
9734-
error_count = gaudi2_handle_edma_core_event(hdev, event_type, index);
9712+
error_count = gaudi2_handle_dma_core_event(hdev, event_type,
9713+
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
97359714
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
97369715
break;
97379716

97389717
case GAUDI2_EVENT_PDMA0_CORE ... GAUDI2_EVENT_PDMA1_CORE:
9739-
index = event_type - GAUDI2_EVENT_PDMA0_CORE;
9740-
error_count = gaudi2_handle_pdma_core_event(hdev, event_type, index);
9718+
error_count = gaudi2_handle_dma_core_event(hdev, event_type,
9719+
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
97419720
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
97429721
break;
97439722

0 commit comments

Comments
 (0)