@@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
7744
7744
return !!ecc_data -> is_critical ;
7745
7745
}
7746
7746
7747
- /*
7748
- * gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
7749
- *
7750
- * @idx: the current pi/ci value
7751
- * @q_len: the queue length (power of 2)
7752
- *
7753
- * @return the cyclically decremented index
7754
- */
7755
- static inline u32 gaudi2_queue_idx_dec (u32 idx , u32 q_len )
7756
- {
7757
- u32 mask = q_len - 1 ;
7758
-
7759
- /*
7760
- * modular decrement is equivalent to adding (q_len - 1);
7761
- * later we take LSBs to make sure the value is in the
7762
- * range [0, q_len - 1]
7763
- */
7764
- return (idx + q_len - 1 ) & mask ;
7765
- }
7766
-
7767
- /**
7768
- * gaudi2_print_sw_config_stream_data - print SW config stream data
7769
- *
7770
- * @hdev: pointer to the habanalabs device structure
7771
- * @stream: the QMAN's stream
7772
- * @qman_base: base address of QMAN registers block
7773
- */
7774
- static void gaudi2_print_sw_config_stream_data (struct hl_device * hdev ,
7775
- u32 stream , u64 qman_base )
7747
+ static void print_lower_qman_data_on_err (struct hl_device * hdev , u64 qman_base )
7776
7748
{
7777
- u64 cq_ptr_lo , cq_ptr_hi , cq_tsize , cq_ptr ;
7778
- u32 cq_ptr_lo_off , size ;
7749
+ u32 lo , hi , cq_ptr_size , arc_cq_ptr_size ;
7750
+ u64 cq_ptr , arc_cq_ptr , cp_current_inst ;
7779
7751
7780
- cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 ;
7781
-
7782
- cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE ) +
7783
- stream * cq_ptr_lo_off ;
7784
-
7785
- cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 );
7786
-
7787
- cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 );
7788
-
7789
- cq_ptr = (((u64 ) RREG32 (cq_ptr_hi )) << 32 ) | RREG32 (cq_ptr_lo );
7790
- size = RREG32 (cq_tsize );
7791
- dev_info (hdev -> dev , "stop on err: stream: %u, addr: %#llx, size: %x\n" ,
7792
- stream , cq_ptr , size );
7793
- }
7794
-
7795
- /**
7796
- * gaudi2_print_last_pqes_on_err - print last PQEs on error
7797
- *
7798
- * @hdev: pointer to the habanalabs device structure
7799
- * @qid_base: first QID of the QMAN (out of 4 streams)
7800
- * @stream: the QMAN's stream
7801
- * @qman_base: base address of QMAN registers block
7802
- * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
7803
- */
7804
- static void gaudi2_print_last_pqes_on_err (struct hl_device * hdev , u32 qid_base , u32 stream ,
7805
- u64 qman_base , bool pr_sw_conf )
7806
- {
7807
- u32 ci , qm_ci_stream_off ;
7808
- struct hl_hw_queue * q ;
7809
- u64 pq_ci ;
7810
- int i ;
7752
+ lo = RREG32 (qman_base + QM_CQ_PTR_LO_4_OFFSET );
7753
+ hi = RREG32 (qman_base + QM_CQ_PTR_HI_4_OFFSET );
7754
+ cq_ptr = ((u64 ) hi ) << 32 | lo ;
7755
+ cq_ptr_size = RREG32 (qman_base + QM_CQ_TSIZE_4_OFFSET );
7811
7756
7812
- q = & hdev -> kernel_queues [qid_base + stream ];
7813
-
7814
- qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0 ;
7815
- pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE ) +
7816
- stream * qm_ci_stream_off ;
7817
-
7818
- hdev -> asic_funcs -> hw_queues_lock (hdev );
7819
-
7820
- if (pr_sw_conf )
7821
- gaudi2_print_sw_config_stream_data (hdev , stream , qman_base );
7822
-
7823
- ci = RREG32 (pq_ci );
7824
-
7825
- /* we should start printing from ci - 1 */
7826
- ci = gaudi2_queue_idx_dec (ci , HL_QUEUE_LENGTH );
7827
-
7828
- for (i = 0 ; i < PQ_FETCHER_CACHE_SIZE ; i ++ ) {
7829
- struct hl_bd * bd ;
7830
- u64 addr ;
7831
- u32 len ;
7832
-
7833
- bd = q -> kernel_address ;
7834
- bd += ci ;
7835
-
7836
- len = le32_to_cpu (bd -> len );
7837
- /* len 0 means uninitialized entry- break */
7838
- if (!len )
7839
- break ;
7840
-
7841
- addr = le64_to_cpu (bd -> ptr );
7842
-
7843
- dev_info (hdev -> dev , "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n" ,
7844
- stream , ci , addr , len );
7845
-
7846
- /* get previous ci, wrap if needed */
7847
- ci = gaudi2_queue_idx_dec (ci , HL_QUEUE_LENGTH );
7848
- }
7849
-
7850
- hdev -> asic_funcs -> hw_queues_unlock (hdev );
7851
- }
7852
-
7853
- /**
7854
- * print_qman_data_on_err - extract QMAN data on error
7855
- *
7856
- * @hdev: pointer to the habanalabs device structure
7857
- * @qid_base: first QID of the QMAN (out of 4 streams)
7858
- * @stream: the QMAN's stream
7859
- * @qman_base: base address of QMAN registers block
7860
- *
7861
- * This function attempts to extract as much data as possible on QMAN error.
7862
- * On upper CP print the SW config stream data and last 8 PQEs.
7863
- * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
7864
- */
7865
- static void print_qman_data_on_err (struct hl_device * hdev , u32 qid_base , u32 stream , u64 qman_base )
7866
- {
7867
- u32 i ;
7868
-
7869
- if (stream != QMAN_STREAMS ) {
7870
- gaudi2_print_last_pqes_on_err (hdev , qid_base , stream , qman_base , true);
7871
- return ;
7872
- }
7757
+ lo = RREG32 (qman_base + QM_ARC_CQ_PTR_LO_OFFSET );
7758
+ hi = RREG32 (qman_base + QM_ARC_CQ_PTR_HI_OFFSET );
7759
+ arc_cq_ptr = ((u64 ) hi ) << 32 | lo ;
7760
+ arc_cq_ptr_size = RREG32 (qman_base + QM_ARC_CQ_TSIZE_OFFSET );
7873
7761
7874
- gaudi2_print_sw_config_stream_data (hdev , stream , qman_base );
7762
+ lo = RREG32 (qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET );
7763
+ hi = RREG32 (qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET );
7764
+ cp_current_inst = ((u64 ) hi ) << 32 | lo ;
7875
7765
7876
- for (i = 0 ; i < QMAN_STREAMS ; i ++ )
7877
- gaudi2_print_last_pqes_on_err (hdev , qid_base , i , qman_base , false);
7766
+ dev_info (hdev -> dev ,
7767
+ "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n" ,
7768
+ cq_ptr , cq_ptr_size , arc_cq_ptr , arc_cq_ptr_size , cp_current_inst );
7878
7769
}
7879
7770
7880
7771
static int gaudi2_handle_qman_err_generic (struct hl_device * hdev , u16 event_type ,
@@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
7912
7803
error_count ++ ;
7913
7804
}
7914
7805
7915
- print_qman_data_on_err (hdev , qid_base , i , qman_base );
7806
+ if (i == QMAN_STREAMS )
7807
+ print_lower_qman_data_on_err (hdev , qman_base );
7916
7808
}
7917
7809
7918
7810
arb_err_val = RREG32 (arb_err_addr );
0 commit comments