Skip to content

Commit 7692e1e

Browse files
Tao Zhou authored and alexdeucher committed
drm/amdgpu: add RAS fatal error handler for NBIO v7.9
Register the RAS fatal error interrupt and add its handler. v2: Only register NBIO RAS for dGPU platforms. Change nbio_v7_9_set_ras_controller_irq_state and nbio_v7_9_set_ras_err_event_athub_irq_state to dummy functions. Signed-off-by: Tao Zhou <[email protected]> Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 657db07 commit 7692e1e

File tree

3 files changed

+193
-0
lines changed

3 files changed

+193
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "amdgpu_xgmi.h"
3636
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
3737
#include "nbio_v4_3.h"
38+
#include "nbio_v7_9.h"
3839
#include "atom.h"
3940
#include "amdgpu_reset.h"
4041

@@ -2644,6 +2645,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
26442645
* check DF RAS */
26452646
adev->nbio.ras = &nbio_v4_3_ras;
26462647
break;
2648+
case IP_VERSION(7, 9, 0):
2649+
if (!adev->gmc.is_app_apu)
2650+
adev->nbio.ras = &nbio_v7_9_ras;
2651+
break;
26472652
default:
26482653
/* nbio ras is not available */
26492654
break;

drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,3 +471,190 @@ const struct amdgpu_nbio_funcs nbio_v7_9_funcs = {
471471
.init_registers = nbio_v7_9_init_registers,
472472
.get_pcie_replay_count = nbio_v7_9_get_pcie_replay_count,
473473
};
474+
475+
/*
 * nbio_v7_9_query_ras_error_count - RAS error-count query hook for NBIO v7.9
 * @adev: amdgpu device (unused)
 * @ras_error_status: caller-supplied error status buffer (unused)
 *
 * Intentionally empty: no register is read and @ras_error_status is left
 * exactly as the caller passed it in.
 */
static void nbio_v7_9_query_ras_error_count(struct amdgpu_device *adev,
					    void *ras_error_status)
{
}
480+
481+
static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
482+
{
483+
uint32_t bif_doorbell_intr_cntl;
484+
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
485+
struct ras_err_data err_data = {0, 0, 0, NULL};
486+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
487+
488+
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
489+
490+
if (REG_GET_FIELD(bif_doorbell_intr_cntl,
491+
BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
492+
/* driver has to clear the interrupt status when bif ring is disabled */
493+
bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
494+
BIF_BX0_BIF_DOORBELL_INT_CNTL,
495+
RAS_CNTLR_INTERRUPT_CLEAR, 1);
496+
WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
497+
498+
if (!ras->disable_ras_err_cnt_harvest) {
499+
/*
500+
* clear error status after ras_controller_intr
501+
* according to hw team and count ue number
502+
* for query
503+
*/
504+
nbio_v7_9_query_ras_error_count(adev, &err_data);
505+
506+
/* logging on error cnt and printing for awareness */
507+
obj->err_data.ue_count += err_data.ue_count;
508+
obj->err_data.ce_count += err_data.ce_count;
509+
510+
if (err_data.ce_count)
511+
dev_info(adev->dev, "%ld correctable hardware "
512+
"errors detected in %s block, "
513+
"no user action is needed.\n",
514+
obj->err_data.ce_count,
515+
get_ras_block_str(adev->nbio.ras_if));
516+
517+
if (err_data.ue_count)
518+
dev_info(adev->dev, "%ld uncorrectable hardware "
519+
"errors detected in %s block\n",
520+
obj->err_data.ue_count,
521+
get_ras_block_str(adev->nbio.ras_if));
522+
}
523+
524+
dev_info(adev->dev, "RAS controller interrupt triggered "
525+
"by NBIF error\n");
526+
527+
/* ras_controller_int is dedicated for nbif ras error,
528+
* not the global interrupt for sync flood
529+
*/
530+
amdgpu_ras_reset_gpu(adev);
531+
}
532+
}
533+
534+
static void nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
535+
{
536+
uint32_t bif_doorbell_intr_cntl;
537+
538+
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
539+
540+
if (REG_GET_FIELD(bif_doorbell_intr_cntl,
541+
BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
542+
/* driver has to clear the interrupt status when bif ring is disabled */
543+
bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
544+
BIF_BX0_BIF_DOORBELL_INT_CNTL,
545+
RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
546+
547+
WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
548+
549+
amdgpu_ras_global_ras_isr(adev);
550+
}
551+
}
552+
553+
static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,
554+
struct amdgpu_irq_src *src,
555+
unsigned type,
556+
enum amdgpu_interrupt_state state)
557+
{
558+
/* Dummy function, there is no initialization operation in driver */
559+
560+
return 0;
561+
}
562+
563+
/*
 * nbio_v7_9_process_ras_controller_irq - .process callback of the RAS
 * controller interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * By design, the IH cookie for ras_controller_irq should be written to the
 * BIF ring instead of the general IV ring.  However, due to a known BIF ring
 * hardware bug, the BIF ring has to be disabled, so there is no chance this
 * process function will be invoked.  It is left as a dummy.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
						struct amdgpu_irq_src *source,
						struct amdgpu_iv_entry *entry)
{
	return 0;
}
574+
575+
static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
576+
struct amdgpu_irq_src *src,
577+
unsigned type,
578+
enum amdgpu_interrupt_state state)
579+
{
580+
/* Dummy function, there is no initialization operation in driver */
581+
582+
return 0;
583+
}
584+
585+
/*
 * nbio_v7_9_process_err_event_athub_irq - .process callback of the ATHUB
 * error-event interrupt source
 * @adev: amdgpu device (unused)
 * @source: interrupt source (unused)
 * @entry: interrupt vector entry (unused)
 *
 * By design, the IH cookie for err_event_athub_irq should be written to the
 * BIF ring instead of the general IV ring.  However, due to a known BIF ring
 * hardware bug, the BIF ring has to be disabled, so there is no chance this
 * process function will be invoked.  It is left as a dummy.
 *
 * Return: always 0.
 */
static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
						 struct amdgpu_irq_src *source,
						 struct amdgpu_iv_entry *entry)
{
	return 0;
}
596+
597+
static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
598+
.set = nbio_v7_9_set_ras_controller_irq_state,
599+
.process = nbio_v7_9_process_ras_controller_irq,
600+
};
601+
602+
static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
603+
.set = nbio_v7_9_set_ras_err_event_athub_irq_state,
604+
.process = nbio_v7_9_process_err_event_athub_irq,
605+
};
606+
607+
static int nbio_v7_9_init_ras_controller_interrupt (struct amdgpu_device *adev)
608+
{
609+
int r;
610+
611+
/* init the irq funcs */
612+
adev->nbio.ras_controller_irq.funcs =
613+
&nbio_v7_9_ras_controller_irq_funcs;
614+
adev->nbio.ras_controller_irq.num_types = 1;
615+
616+
/* register ras controller interrupt */
617+
r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
618+
NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT,
619+
&adev->nbio.ras_controller_irq);
620+
621+
return r;
622+
}
623+
624+
static int nbio_v7_9_init_ras_err_event_athub_interrupt (struct amdgpu_device *adev)
625+
{
626+
627+
int r;
628+
629+
/* init the irq funcs */
630+
adev->nbio.ras_err_event_athub_irq.funcs =
631+
&nbio_v7_9_ras_err_event_athub_irq_funcs;
632+
adev->nbio.ras_err_event_athub_irq.num_types = 1;
633+
634+
/* register ras err event athub interrupt */
635+
r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
636+
NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
637+
&adev->nbio.ras_err_event_athub_irq);
638+
639+
return r;
640+
}
641+
642+
const struct amdgpu_ras_block_hw_ops nbio_v7_9_ras_hw_ops = {
643+
.query_ras_error_count = nbio_v7_9_query_ras_error_count,
644+
};
645+
646+
struct amdgpu_nbio_ras nbio_v7_9_ras = {
647+
.ras_block = {
648+
.ras_comm = {
649+
.name = "pcie_bif",
650+
.block = AMDGPU_RAS_BLOCK__PCIE_BIF,
651+
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
652+
},
653+
.hw_ops = &nbio_v7_9_ras_hw_ops,
654+
.ras_late_init = amdgpu_nbio_ras_late_init,
655+
},
656+
.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
657+
.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
658+
.init_ras_controller_interrupt = nbio_v7_9_init_ras_controller_interrupt,
659+
.init_ras_err_event_athub_interrupt = nbio_v7_9_init_ras_err_event_athub_interrupt,
660+
};

drivers/gpu/drm/amd/amdgpu/nbio_v7_9.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@
2828

2929
extern const struct nbio_hdp_flush_reg nbio_v7_9_hdp_flush_reg;
3030
extern const struct amdgpu_nbio_funcs nbio_v7_9_funcs;
31+
extern struct amdgpu_nbio_ras nbio_v7_9_ras;
3132

3233
#endif

0 commit comments

Comments
 (0)