Skip to content

Commit 6092ced

Browse files
Tomer Tayar authored and ogabbay committed
accel/habanalabs: print qman data on error only for lower qman
By default, the upper QMANs are not used, and instead the engine ARCs access the lower QMANs directly. Errors for upper QMANs are therefore not expected, and the debug print of the PQ entries is not needed. Modify the QMAN debug data print on errors to include only information for the lower QMAN.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
1 parent 54381ee commit 6092ced

File tree

3 files changed

+31
-128
lines changed

3 files changed

+31
-128
lines changed

drivers/accel/habanalabs/gaudi2/gaudi2.c

Lines changed: 19 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
77447744
return !!ecc_data->is_critical;
77457745
}
77467746

7747-
/*
7748-
* gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
7749-
*
7750-
* @idx: the current pi/ci value
7751-
* @q_len: the queue length (power of 2)
7752-
*
7753-
* @return the cyclically decremented index
7754-
*/
7755-
static inline u32 gaudi2_queue_idx_dec(u32 idx, u32 q_len)
7756-
{
7757-
u32 mask = q_len - 1;
7758-
7759-
/*
7760-
* modular decrement is equivalent to adding (queue_size -1)
7761-
* later we take LSBs to make sure the value is in the
7762-
* range [0, queue_len - 1]
7763-
*/
7764-
return (idx + q_len - 1) & mask;
7765-
}
7766-
7767-
/**
7768-
* gaudi2_print_sw_config_stream_data - print SW config stream data
7769-
*
7770-
* @hdev: pointer to the habanalabs device structure
7771-
* @stream: the QMAN's stream
7772-
* @qman_base: base address of QMAN registers block
7773-
*/
7774-
static void gaudi2_print_sw_config_stream_data(struct hl_device *hdev,
7775-
u32 stream, u64 qman_base)
7747+
static void print_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base)
77767748
{
7777-
u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr;
7778-
u32 cq_ptr_lo_off, size;
7749+
u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
7750+
u64 cq_ptr, arc_cq_ptr, cp_current_inst;
77797751

7780-
cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0;
7781-
7782-
cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE) +
7783-
stream * cq_ptr_lo_off;
7784-
7785-
cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
7786-
7787-
cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0);
7788-
7789-
cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
7790-
size = RREG32(cq_tsize);
7791-
dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
7792-
stream, cq_ptr, size);
7793-
}
7794-
7795-
/**
7796-
* gaudi2_print_last_pqes_on_err - print last PQEs on error
7797-
*
7798-
* @hdev: pointer to the habanalabs device structure
7799-
* @qid_base: first QID of the QMAN (out of 4 streams)
7800-
* @stream: the QMAN's stream
7801-
* @qman_base: base address of QMAN registers block
7802-
* @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
7803-
*/
7804-
static void gaudi2_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, u32 stream,
7805-
u64 qman_base, bool pr_sw_conf)
7806-
{
7807-
u32 ci, qm_ci_stream_off;
7808-
struct hl_hw_queue *q;
7809-
u64 pq_ci;
7810-
int i;
7752+
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
7753+
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
7754+
cq_ptr = ((u64) hi) << 32 | lo;
7755+
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
78117756

7812-
q = &hdev->kernel_queues[qid_base + stream];
7813-
7814-
qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0;
7815-
pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE) +
7816-
stream * qm_ci_stream_off;
7817-
7818-
hdev->asic_funcs->hw_queues_lock(hdev);
7819-
7820-
if (pr_sw_conf)
7821-
gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
7822-
7823-
ci = RREG32(pq_ci);
7824-
7825-
/* we should start printing form ci -1 */
7826-
ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
7827-
7828-
for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) {
7829-
struct hl_bd *bd;
7830-
u64 addr;
7831-
u32 len;
7832-
7833-
bd = q->kernel_address;
7834-
bd += ci;
7835-
7836-
len = le32_to_cpu(bd->len);
7837-
/* len 0 means uninitialized entry- break */
7838-
if (!len)
7839-
break;
7840-
7841-
addr = le64_to_cpu(bd->ptr);
7842-
7843-
dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
7844-
stream, ci, addr, len);
7845-
7846-
/* get previous ci, wrap if needed */
7847-
ci = gaudi2_queue_idx_dec(ci, HL_QUEUE_LENGTH);
7848-
}
7849-
7850-
hdev->asic_funcs->hw_queues_unlock(hdev);
7851-
}
7852-
7853-
/**
7854-
* print_qman_data_on_err - extract QMAN data on error
7855-
*
7856-
* @hdev: pointer to the habanalabs device structure
7857-
* @qid_base: first QID of the QMAN (out of 4 streams)
7858-
* @stream: the QMAN's stream
7859-
* @qman_base: base address of QMAN registers block
7860-
*
7861-
* This function attempt to extract as much data as possible on QMAN error.
7862-
* On upper CP print the SW config stream data and last 8 PQEs.
7863-
* On lower CP print SW config data and last PQEs of ALL 4 upper CPs
7864-
*/
7865-
static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, u64 qman_base)
7866-
{
7867-
u32 i;
7868-
7869-
if (stream != QMAN_STREAMS) {
7870-
gaudi2_print_last_pqes_on_err(hdev, qid_base, stream, qman_base, true);
7871-
return;
7872-
}
7757+
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
7758+
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
7759+
arc_cq_ptr = ((u64) hi) << 32 | lo;
7760+
arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
78737761

7874-
gaudi2_print_sw_config_stream_data(hdev, stream, qman_base);
7762+
lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
7763+
hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
7764+
cp_current_inst = ((u64) hi) << 32 | lo;
78757765

7876-
for (i = 0 ; i < QMAN_STREAMS ; i++)
7877-
gaudi2_print_last_pqes_on_err(hdev, qid_base, i, qman_base, false);
7766+
dev_info(hdev->dev,
7767+
"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
7768+
cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
78787769
}
78797770

78807771
static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type,
@@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
79127803
error_count++;
79137804
}
79147805

7915-
print_qman_data_on_err(hdev, qid_base, i, qman_base);
7806+
if (i == QMAN_STREAMS)
7807+
print_lower_qman_data_on_err(hdev, qman_base);
79167808
}
79177809

79187810
arb_err_val = RREG32(arb_err_addr);

drivers/accel/habanalabs/gaudi2/gaudi2P.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
#define GAUDI2_DEFAULT_CARD_NAME "HL225"
9999

100100
#define QMAN_STREAMS 4
101-
#define PQ_FETCHER_CACHE_SIZE 8
101+
102102
#define NUM_OF_MME_SBTE_PORTS 5
103103
#define NUM_OF_MME_WB_PORTS 2
104104

drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,17 @@
242242
#define QM_FENCE2_OFFSET (mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE)
243243
#define QM_SEI_STATUS_OFFSET (mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE)
244244

245+
#define QM_CQ_PTR_LO_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE)
246+
#define QM_CQ_PTR_HI_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE)
247+
#define QM_CQ_TSIZE_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE)
248+
249+
#define QM_ARC_CQ_PTR_LO_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE)
250+
#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
251+
#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
252+
253+
#define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
254+
#define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
255+
245256
#define SFT_OFFSET (mmSFT1_HBW_RTR_IF0_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
246257
#define SFT_IF_RTR_OFFSET (mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
247258

0 commit comments

Comments
 (0)