@@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
471
471
.init_registers = nbio_v7_9_init_registers ,
472
472
.get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count ,
473
473
};
474
+
475
/**
 * nbio_v7_9_query_ras_error_count - RAS error count query hook for NBIO v7.9
 * @adev: amdgpu device (unused)
 * @ras_error_status: caller-provided error status buffer (left untouched)
 *
 * Intentionally empty: this implementation reports nothing and never writes
 * to @ras_error_status.
 */
static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
					    void *ras_error_status)
{
}
480
+
481
+ static void nbio_v7_9_handle_ras_controller_intr_no_bifring (struct amdgpu_device * adev )
482
+ {
483
+ uint32_t bif_doorbell_intr_cntl ;
484
+ struct ras_manager * obj = amdgpu_ras_find_obj (adev , adev -> nbio .ras_if );
485
+ struct ras_err_data err_data = {0 , 0 , 0 , NULL };
486
+ struct amdgpu_ras * ras = amdgpu_ras_get_context (adev );
487
+
488
+ bif_doorbell_intr_cntl = RREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL );
489
+
490
+ if (REG_GET_FIELD (bif_doorbell_intr_cntl ,
491
+ BIF_BX0_BIF_DOORBELL_INT_CNTL , RAS_CNTLR_INTERRUPT_STATUS )) {
492
+ /* driver has to clear the interrupt status when bif ring is disabled */
493
+ bif_doorbell_intr_cntl = REG_SET_FIELD (bif_doorbell_intr_cntl ,
494
+ BIF_BX0_BIF_DOORBELL_INT_CNTL ,
495
+ RAS_CNTLR_INTERRUPT_CLEAR , 1 );
496
+ WREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL , bif_doorbell_intr_cntl );
497
+
498
+ if (!ras -> disable_ras_err_cnt_harvest ) {
499
+ /*
500
+ * clear error status after ras_controller_intr
501
+ * according to hw team and count ue number
502
+ * for query
503
+ */
504
+ nbio_v7_9_query_ras_error_count (adev , & err_data );
505
+
506
+ /* logging on error cnt and printing for awareness */
507
+ obj -> err_data .ue_count += err_data .ue_count ;
508
+ obj -> err_data .ce_count += err_data .ce_count ;
509
+
510
+ if (err_data .ce_count )
511
+ dev_info (adev -> dev , "%ld correctable hardware "
512
+ "errors detected in %s block, "
513
+ "no user action is needed.\n" ,
514
+ obj -> err_data .ce_count ,
515
+ get_ras_block_str (adev -> nbio .ras_if ));
516
+
517
+ if (err_data .ue_count )
518
+ dev_info (adev -> dev , "%ld uncorrectable hardware "
519
+ "errors detected in %s block\n" ,
520
+ obj -> err_data .ue_count ,
521
+ get_ras_block_str (adev -> nbio .ras_if ));
522
+ }
523
+
524
+ dev_info (adev -> dev , "RAS controller interrupt triggered "
525
+ "by NBIF error\n" );
526
+
527
+ /* ras_controller_int is dedicated for nbif ras error,
528
+ * not the global interrupt for sync flood
529
+ */
530
+ amdgpu_ras_reset_gpu (adev );
531
+ }
532
+ }
533
+
534
+ static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring (struct amdgpu_device * adev )
535
+ {
536
+ uint32_t bif_doorbell_intr_cntl ;
537
+
538
+ bif_doorbell_intr_cntl = RREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL );
539
+
540
+ if (REG_GET_FIELD (bif_doorbell_intr_cntl ,
541
+ BIF_BX0_BIF_DOORBELL_INT_CNTL , RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS )) {
542
+ /* driver has to clear the interrupt status when bif ring is disabled */
543
+ bif_doorbell_intr_cntl = REG_SET_FIELD (bif_doorbell_intr_cntl ,
544
+ BIF_BX0_BIF_DOORBELL_INT_CNTL ,
545
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR , 1 );
546
+
547
+ WREG32_SOC15 (NBIO , 0 , regBIF_BX0_BIF_DOORBELL_INT_CNTL , bif_doorbell_intr_cntl );
548
+
549
+ amdgpu_ras_global_ras_isr (adev );
550
+ }
551
+ }
552
+
553
+ static int nbio_v7_9_set_ras_controller_irq_state (struct amdgpu_device * adev ,
554
+ struct amdgpu_irq_src * src ,
555
+ unsigned type ,
556
+ enum amdgpu_interrupt_state state )
557
+ {
558
+ /* Dummy function, there is no initialization operation in driver */
559
+
560
+ return 0 ;
561
+ }
562
+
563
/**
 * nbio_v7_9_process_ras_controller_irq - .process callback for the RAS
 * controller interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * The ih cookie for ras_controller_irq is designed to be written to the
 * BIFring rather than the general iv ring. Because of a known bif ring hw
 * bug the BIFring has to stay disabled, so this callback is never expected
 * to be invoked and is kept only as a placeholder.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
						struct amdgpu_irq_src *source,
						struct amdgpu_iv_entry *entry)
{
	return 0;
}
574
+
575
+ static int nbio_v7_9_set_ras_err_event_athub_irq_state (struct amdgpu_device * adev ,
576
+ struct amdgpu_irq_src * src ,
577
+ unsigned type ,
578
+ enum amdgpu_interrupt_state state )
579
+ {
580
+ /* Dummy function, there is no initialization operation in driver */
581
+
582
+ return 0 ;
583
+ }
584
+
585
/**
 * nbio_v7_9_process_err_event_athub_irq - .process callback for the ATHUB
 * error event interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * The ih cookie for err_event_athub_irq is designed to be written to the
 * BIFring rather than the general iv ring. Because of a known bif ring hw
 * bug the BIFring has to stay disabled, so this callback is never expected
 * to be invoked and is kept only as a placeholder.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
						 struct amdgpu_irq_src *source,
						 struct amdgpu_iv_entry *entry)
{
	return 0;
}
596
+
597
+ static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
598
+ .set = nbio_v7_9_set_ras_controller_irq_state ,
599
+ .process = nbio_v7_9_process_ras_controller_irq ,
600
+ };
601
+
602
+ static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
603
+ .set = nbio_v7_9_set_ras_err_event_athub_irq_state ,
604
+ .process = nbio_v7_9_process_err_event_athub_irq ,
605
+ };
606
+
607
+ static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device * adev )
608
+ {
609
+ int r ;
610
+
611
+ /* init the irq funcs */
612
+ adev -> nbio .ras_controller_irq .funcs =
613
+ & nbio_v7_9_ras_controller_irq_funcs ;
614
+ adev -> nbio .ras_controller_irq .num_types = 1 ;
615
+
616
+ /* register ras controller interrupt */
617
+ r = amdgpu_irq_add_id (adev , SOC15_IH_CLIENTID_BIF ,
618
+ NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT ,
619
+ & adev -> nbio .ras_controller_irq );
620
+
621
+ return r ;
622
+ }
623
+
624
+ static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device * adev )
625
+ {
626
+
627
+ int r ;
628
+
629
+ /* init the irq funcs */
630
+ adev -> nbio .ras_err_event_athub_irq .funcs =
631
+ & nbio_v7_9_ras_err_event_athub_irq_funcs ;
632
+ adev -> nbio .ras_err_event_athub_irq .num_types = 1 ;
633
+
634
+ /* register ras err event athub interrupt */
635
+ r = amdgpu_irq_add_id (adev , SOC15_IH_CLIENTID_BIF ,
636
+ NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT ,
637
+ & adev -> nbio .ras_err_event_athub_irq );
638
+
639
+ return r ;
640
+ }
641
+
642
+ const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
643
+ .query_ras_error_count = nbio_v7_9_query_ras_error_count ,
644
+ };
645
+
646
+ struct amdgpu_nbio_ras nbio_v7_9_ras = {
647
+ .ras_block = {
648
+ .ras_comm = {
649
+ .name = "pcie_bif" ,
650
+ .block = AMDGPU_RAS_BLOCK__PCIE_BIF ,
651
+ .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE ,
652
+ },
653
+ .hw_ops = & nbio_v7_9_ras_hw_ops ,
654
+ .ras_late_init = amdgpu_nbio_ras_late_init ,
655
+ },
656
+ .handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring ,
657
+ .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring ,
658
+ .init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt ,
659
+ .init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt ,
660
+ };
0 commit comments