Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions src/components/gaudi2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Gaudi2 Component

The `gaudi2` component provides access to hardware performance counters on Intel Gaudi2 AI Accelerators through the SPMU interface.

- [Environment Variables](#environment-variables)
- [Enabling the Gaudi2 Component](#enabling-the-gaudi2-component)

## Environment Variables
The `gaudi2` component requires the `PAPI_GAUDI2_ROOT` environment variable to be set to the habanalabs installation directory, which provides the `hl-thunk` headers and libraries.

```bash
export PAPI_GAUDI2_ROOT=/usr
```

## Enabling the Gaudi2 Component

To enable the `gaudi2` component, configure and build PAPI with the component enabled as follows:

```bash
./configure --with-components="gaudi2"
make && make install
```
30 changes: 30 additions & 0 deletions src/components/gaudi2/Rules.gaudi2
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Rules for the Gaudi2 component
#
# Environment variables:
# PAPI_GAUDI2_ROOT - Path to the Habana installation root.
# Optional at this level: the `?=` assignment below falls back to /usr
# when the variable is not already defined.
# Example: /usr (for container builds with headers in /usr/include/habanalabs
# and libraries in /usr/lib/habanalabs)
# Expected directory structure under PAPI_GAUDI2_ROOT:
# $(PAPI_GAUDI2_ROOT)/include/habanalabs/hlthunk.h
# $(PAPI_GAUDI2_ROOT)/lib/habanalabs/libhl-thunk.so

# Component source files
COMPSRCS += components/gaudi2/linux-gaudi2.c
COMPOBJS += linux-gaudi2.o

# Default to /usr if not set (an existing environment or command-line value wins)
PAPI_GAUDI2_ROOT ?= /usr

# Header and library locations derived from the root.
# NOTE(review): GAUDI2_LIBDIR is not referenced by any rule visible in this file;
# confirm whether the component source or another makefile consumes it.
GAUDI2_INCDIR = $(PAPI_GAUDI2_ROOT)/include/habanalabs
GAUDI2_LIBDIR = $(PAPI_GAUDI2_ROOT)/lib/habanalabs

# Compiler flags
# -I$(GAUDI2_INCDIR) for hlthunk.h and habanalabs_accel.h
# -I$(PAPI_GAUDI2_ROOT)/include/drm adds the drm header directory under the same root
# -g is appended to the shared CFLAGS, so it is not limited to this component's object
CFLAGS += -I$(GAUDI2_INCDIR) -I$(PAPI_GAUDI2_ROOT)/include/drm -g

# Linker flags
# Use dlopen for hlthunk library (no direct linking)
LDFLAGS += $(LDL)

# Compilation rule
# Rebuilds the object when the component source, the event table header,
# or any file listed in $(HEADERS) changes.
linux-gaudi2.o: components/gaudi2/linux-gaudi2.c components/gaudi2/gaudi2_events.h $(HEADERS)
$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c components/gaudi2/linux-gaudi2.c -o linux-gaudi2.o
320 changes: 320 additions & 0 deletions src/components/gaudi2/gaudi2_events.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,320 @@
/**
* @file gaudi2_events.h
*
* @author Tokey Tahmid ttahmid@icl.utk.edu
*/

#ifndef _GAUDI2_EVENTS_H
#define _GAUDI2_EVENTS_H

/*
 * Engine categories referenced by the event tables in this header.
 * Values are contiguous and start at zero.
 */
typedef enum {
    GAUDI2_ENGINE_TPC     = 0,
    GAUDI2_ENGINE_MME     = 1,
    GAUDI2_ENGINE_EDMA    = 2,
    GAUDI2_ENGINE_PDMA    = 3,
    GAUDI2_ENGINE_ROTATOR = 4,
    GAUDI2_ENGINE_NIC     = 5,
    GAUDI2_ENGINE_HBM     = 6,
    GAUDI2_ENGINE_HMMU    = 7,
    GAUDI2_ENGINE_MAX     = 8   /* sentinel: equals the number of engine types above */
} gaudi2_engine_type_t;

/*
 * Instance counts.
 * The *_PER_DCORE values are per-DCORE; multiplied by GAUDI2_NUM_DCORES they
 * give the device-wide totals noted beside each define.
 */
#define GAUDI2_NUM_DCORES 4
#define GAUDI2_TPC_PER_DCORE 6 /* 6 x 4 DCOREs = 24 TPCs total */
#define GAUDI2_EDMA_PER_DCORE 2 /* 2 x 4 DCOREs = 8 EDMAs total */
#define GAUDI2_MME_PER_DCORE 1 /* 1 x 4 DCOREs = 4 MMEs total */
#define GAUDI2_HMMU_PER_DCORE 4 /* 4 x 4 DCOREs = 16 HMMUs total */
#define GAUDI2_NUM_PDMA 2
#define GAUDI2_NUM_ROTATOR 2
#define GAUDI2_NUM_NIC 12
#define GAUDI2_NUM_HBM 6
#define GAUDI2_NUM_HBM_MC 2 /* 2 memory controllers per HBM */

/* NOTE(review): presumably the number of counters a single SPMU can hold at once - confirm against the component source */
#define GAUDI2_MAX_SPMU_COUNTERS 6

/*
 * TPC SPMU event identifiers: 81 events, IDs 0-80.
 * Every value is spelled out so an entry can be matched to its ID at a glance.
 */
typedef enum {
    /* IDs 0-21: *_BP and STALL_* events */
    TPC_SPMU_MEMORY2SB_BP = 0,
    TPC_SPMU_SB2MEMORY_BP = 1,
    TPC_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY = 2,
    TPC_SPMU_QM_PREFETCH_BUFFER_EMPTY = 3,
    TPC_SPMU_SB_2_CORE_BP = 4,
    TPC_SPMU_SB_2_CORE_BP_SB_FULL = 5,
    TPC_SPMU_SB_2_CORE_BP_SB_MEMORY = 6,
    TPC_SPMU_SB_2_CORE_BP_SB_LD_TNSR_FIFO_FULL = 7,
    TPC_SPMU_WB2CORE_BP = 8,
    TPC_SPMU_STALL_ON_ICACHE_MISS = 9,
    TPC_SPMU_STALL_ON_DCACHE_MISS = 10,
    TPC_SPMU_STALL_ON_POP_FROM_SB = 11,
    TPC_SPMU_STALL_ON_LOOKUP_CACHE_MISS = 12,
    TPC_SPMU_STALL_ON_IRQ_FULL = 13,
    TPC_SPMU_STALL_ON_MAX_COLORS = 14,
    TPC_SPMU_STALL_ON_UARCH_BUBBLE = 15,
    TPC_SPMU_STALL_VPU = 16,
    TPC_SPMU_STALL_SPU_ANY = 17,
    TPC_SPMU_STALL_ON_TSB_FULL = 18,
    TPC_SPMU_STALL_ON_ST_L_EXT = 19,
    TPC_SPMU_STALL_ON_LD_L_EXT = 20,
    TPC_SPMU_STALL = 21,

    /* IDs 22-25: opcode execution counts - configurable via TPC_CFG_OPCODE_EXEC */
    TPC_SPMU_NUM_OF_OPCODE1_EXECUTED = 22,
    TPC_SPMU_NUM_OF_OPCODE2_EXECUTED = 23,
    TPC_SPMU_NUM_OF_OPCODE3_EXECUTED = 24,
    TPC_SPMU_NUM_OF_OPCODE4_EXECUTED = 25,

    /* IDs 26-28: execution events */
    TPC_SPMU_KERNEL_EXECUTED = 26,
    TPC_SPMU_SCALAR_PIPE_EXEC = 27,
    TPC_SPMU_VECTOR_PIPE_EXEC = 28,

    /* IDs 29-34: cache events */
    TPC_SPMU_ICACHE_MISS = 29,
    TPC_SPMU_ICACHE_HIT = 30,
    TPC_SPMU_KILLED_INSTRUCTION = 31,
    TPC_SPMU_LUT_MISS = 32,
    TPC_SPMU_DCACHE_MISS = 33,
    TPC_SPMU_DCACHE_HIT = 34,

    /* IDs 35-39: out-of-bound events, one per dimension 0-4 */
    TPC_SPMU_OUT_OF_BOUND_DIM0 = 35,
    TPC_SPMU_OUT_OF_BOUND_DIM1 = 36,
    TPC_SPMU_OUT_OF_BOUND_DIM2 = 37,
    TPC_SPMU_OUT_OF_BOUND_DIM3 = 38,
    TPC_SPMU_OUT_OF_BOUND_DIM4 = 39,

    /* IDs 40-56: arithmetic exception events (SPU_* first, then VPU_*) */
    TPC_SPMU_DIV_BY_0 = 40,
    TPC_SPMU_SPU_MAC_OVERFLOW = 41,
    TPC_SPMU_SPU_ADDSUB_OVERFLOW = 42,
    TPC_SPMU_SPU_ABS_OVERFLOW = 43,
    TPC_SPMU_SPU_FMA_FP_DST_NAN = 44,
    TPC_SPMU_SPU_FMA_FP_DST_INF = 45,
    TPC_SPMU_SPU_CONVERT_FP_DST_NAN = 46,
    TPC_SPMU_SPU_CONVERT_FP_DST_INF = 47,
    TPC_SPMU_SPU_FP_DST_DENORM = 48,
    TPC_SPMU_VPU_MAC_OVERFLOW = 49,
    TPC_SPMU_VPU_ADDSUB_OVERFLOW = 50,
    TPC_SPMU_VPU_ABS_OVERFLOW = 51,
    TPC_SPMU_VPU_CONVERT_FP_DST_NAN = 52,
    TPC_SPMU_VPU_CONVERT_FP_DST_INF = 53,
    TPC_SPMU_VPU_FMA_FP_DST_NAN = 54,
    TPC_SPMU_VPU_FMA_FP_DST_INF = 55,
    TPC_SPMU_VPU_FP_DST_DENORM = 56,

    /* IDs 57-66: additional events */
    TPC_SPMU_STALL_ON_ST_TSNR_FULL = 57,
    TPC_SPMU_LUT_HIT = 58,
    TPC_SPMU_ADDRESS_EXCEED_VLM = 59,
    TPC_SPMU_LD_LOCK_RESEND = 60,
    TPC_SPMU_LD_L_PROT_VIO = 61,
    TPC_SPMU_ST_L_PROT_VIO = 62,
    TPC_SPMU_DCACHE_L0CD_MISMATCH = 63,
    TPC_SPMU_TPC_STALL_ON_LD_L_INT = 64,
    TPC_SPMU_SB_FIRST_RESPONSE = 65,
    TPC_SPMU_SB_LAST_RESPONSE = 66,

    /* IDs 67-70: SB occupancy events */
    TPC_SPMU_SB_OCCUPANCY0 = 67,
    TPC_SPMU_SB_OCCUPANCY1 = 68,
    TPC_SPMU_SB_OCCUPANCY2 = 69,
    TPC_SPMU_SB_OCCUPANCY3 = 70,

    /* IDs 71-76: SB CAM events (CAM0, then CAM1) */
    TPC_SPMU_SB_DBG_CAM0_MISS = 71,
    TPC_SPMU_SB_DBG_CAM0_HIT = 72,
    TPC_SPMU_SB_DBG_CAM0_UNCACHEABLE = 73,
    TPC_SPMU_SB_DBG_CAM1_MISS = 74,
    TPC_SPMU_SB_DBG_CAM1_HIT = 75,
    TPC_SPMU_SB_DBG_CAM1_UNCACHEABLE = 76,

    /* IDs 77-80: additional cache events */
    TPC_SPMU_NOC_2_SB_BP = 77,
    TPC_SPMU_DCACHE_HW_PREF = 78,
    TPC_SPMU_DCACHE_UC = 79,
    TPC_SPMU_DCACHE_DEALIGN = 80,

    /* Sentinel: one past the last valid event ID */
    TPC_SPMU_EVENT_MAX = 81
} gaudi2_tpc_spmu_event_t;

/*
 * EDMA SPMU event identifiers: 50 events, IDs 0-49.
 * Entries are grouped below by name prefix; each value is written explicitly.
 */
typedef enum {
    /* IDs 0-9: QMAN*_BUF_PEND_CNT_EN events */
    EDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN = 0,
    EDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN = 1,
    EDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN = 2,
    EDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN = 3,
    EDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN = 4,
    EDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN = 5,
    EDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN = 6,
    EDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN = 7,
    EDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN = 8,
    EDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN = 9,

    /* IDs 10-17: AXI_*_ERR, TRACE_*, CP_ERR and ARB_ERR events */
    EDMA_SPMU_AXI_HBW_ERR = 10,
    EDMA_SPMU_AXI_LBW_ERR = 11,
    EDMA_SPMU_TRACE_FENCE_START = 12,
    EDMA_SPMU_TRACE_FENCE_DONE = 13,
    EDMA_SPMU_TRACE_CP_SW_STOP = 14,
    EDMA_SPMU_CP_ERR = 15,
    EDMA_SPMU_ARB_ERR = 16,
    EDMA_SPMU_TRACE_CHOICE_WIN_PUSH = 17,

    /* IDs 18-27: DBG_DMA_TRC_* events */
    EDMA_SPMU_DBG_DMA_TRC_DESC_PUSH = 18,
    EDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT = 19,
    EDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH = 20,
    EDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH = 21,
    EDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH = 22,
    EDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH = 23,
    EDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST = 24,
    EDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST = 25,
    EDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST = 26,
    EDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST = 27,

    /* IDs 28-40: DBG_DMA_SPMU_* events */
    EDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP = 28,
    EDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP = 29,
    EDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP = 30,
    EDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START = 31,
    EDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START = 32,
    EDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP = 33,
    EDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP = 34,
    EDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP = 35,
    EDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP = 36,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0 = 37,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1 = 38,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2 = 39,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3 = 40,

    /* IDs 41-49: SB_* events */
    EDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL = 41,
    EDMA_SPMU_SB_2_INITIATOR_BP = 42,
    EDMA_SPMU_SB_DBG_CAM0_MISS = 43,
    EDMA_SPMU_SB_DBG_CAM0_HIT = 44,
    EDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE = 45,
    EDMA_SPMU_SB_DBG_CAM1_MISS = 46,
    EDMA_SPMU_SB_DBG_CAM1_HIT = 47,
    EDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE = 48,
    EDMA_SPMU_SB_AXI_NOC_2_SB_BP = 49,

    /* Sentinel: one past the last valid event ID */
    EDMA_SPMU_EVENT_MAX = 50
} gaudi2_edma_spmu_event_t;

/*
 * MME CTRL SPMU event identifiers: 8 events, IDs 0-7.
 */
typedef enum {
    MME_CTRL_SPMU_CONV_END_STALL_DIAG           = 0,
    MME_CTRL_SPMU_CONV_END_STALL_ACC            = 1,
    MME_CTRL_SPMU_CONV_END_STALL_DIAG_STALL_ACC = 2,
    MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_B      = 3,
    MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_A      = 4,
    MME_CTRL_SPMU_NUM_OUTER_PRODUCTS            = 5,
    MME_CTRL_SPMU_QM_PREFETCH_BUFFER_EMPTY      = 6,
    MME_CTRL_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY     = 7,

    /* Sentinel: one past the last valid event ID */
    MME_CTRL_SPMU_EVENT_MAX                     = 8
} gaudi2_mme_ctrl_spmu_event_t;

/*
 * PDMA SPMU event identifiers: 50 events, IDs 0-49.
 * Entries are grouped below by name prefix; each value is written explicitly.
 */
typedef enum {
    /* IDs 0-9: QMAN*_BUF_PEND_CNT_EN events */
    PDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN = 0,
    PDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN = 1,
    PDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN = 2,
    PDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN = 3,
    PDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN = 4,
    PDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN = 5,
    PDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN = 6,
    PDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN = 7,
    PDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN = 8,
    PDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN = 9,

    /* IDs 10-17: AXI_*_ERR, TRACE_*, CP_ERR and ARB_ERR events */
    PDMA_SPMU_AXI_HBW_ERR = 10,
    PDMA_SPMU_AXI_LBW_ERR = 11,
    PDMA_SPMU_TRACE_FENCE_START = 12,
    PDMA_SPMU_TRACE_FENCE_DONE = 13,
    PDMA_SPMU_TRACE_CP_SW_STOP = 14,
    PDMA_SPMU_CP_ERR = 15,
    PDMA_SPMU_ARB_ERR = 16,
    PDMA_SPMU_TRACE_CHOICE_WIN_PUSH = 17,

    /* IDs 18-27: DBG_DMA_TRC_* events */
    PDMA_SPMU_DBG_DMA_TRC_DESC_PUSH = 18,
    PDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT = 19,
    PDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH = 20,
    PDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH = 21,
    PDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH = 22,
    PDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH = 23,
    PDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST = 24,
    PDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST = 25,
    PDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST = 26,
    PDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST = 27,

    /* IDs 28-40: DBG_DMA_SPMU_* events */
    PDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP = 28,
    PDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP = 29,
    PDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP = 30,
    PDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START = 31,
    PDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START = 32,
    PDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP = 33,
    PDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP = 34,
    PDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP = 35,
    PDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP = 36,
    PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0 = 37,
    PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1 = 38,
    PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2 = 39,
    PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3 = 40,

    /* IDs 41-49: SB_* events */
    PDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL = 41,
    PDMA_SPMU_SB_2_INITIATOR_BP = 42,
    PDMA_SPMU_SB_DBG_CAM0_MISS = 43,
    PDMA_SPMU_SB_DBG_CAM0_HIT = 44,
    PDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE = 45,
    PDMA_SPMU_SB_DBG_CAM1_MISS = 46,
    PDMA_SPMU_SB_DBG_CAM1_HIT = 47,
    PDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE = 48,
    PDMA_SPMU_SB_AXI_NOC_2_SB_BP = 49,

    /* Sentinel: one past the last valid event ID */
    PDMA_SPMU_EVENT_MAX = 50
} gaudi2_pdma_spmu_event_t;

/*
 * SPMU Base Addresses
 *
 * One 64-bit (ULL) constant per SPMU instance. Only TPC, EDMA and PDMA
 * instances have addresses listed in this block.
 */

/*
 * TPC SPMU base addresses - pattern: DCORE + TPC offset + SPMU offset
 * In the values below, consecutive TPCs within a DCORE are 0x200000 apart
 * and the same TPC index in consecutive DCOREs is 0x1000000 apart.
 */
#define GAUDI2_DCORE0_TPC0_SPMU_BASE 0x1000007FF8001000ULL
#define GAUDI2_DCORE0_TPC1_SPMU_BASE 0x1000007FF8201000ULL
#define GAUDI2_DCORE0_TPC2_SPMU_BASE 0x1000007FF8401000ULL
#define GAUDI2_DCORE0_TPC3_SPMU_BASE 0x1000007FF8601000ULL
#define GAUDI2_DCORE0_TPC4_SPMU_BASE 0x1000007FF8801000ULL
#define GAUDI2_DCORE0_TPC5_SPMU_BASE 0x1000007FF8A01000ULL

#define GAUDI2_DCORE1_TPC0_SPMU_BASE 0x1000007FF9001000ULL
#define GAUDI2_DCORE1_TPC1_SPMU_BASE 0x1000007FF9201000ULL
#define GAUDI2_DCORE1_TPC2_SPMU_BASE 0x1000007FF9401000ULL
#define GAUDI2_DCORE1_TPC3_SPMU_BASE 0x1000007FF9601000ULL
#define GAUDI2_DCORE1_TPC4_SPMU_BASE 0x1000007FF9801000ULL
#define GAUDI2_DCORE1_TPC5_SPMU_BASE 0x1000007FF9A01000ULL

#define GAUDI2_DCORE2_TPC0_SPMU_BASE 0x1000007FFA001000ULL
#define GAUDI2_DCORE2_TPC1_SPMU_BASE 0x1000007FFA201000ULL
#define GAUDI2_DCORE2_TPC2_SPMU_BASE 0x1000007FFA401000ULL
#define GAUDI2_DCORE2_TPC3_SPMU_BASE 0x1000007FFA601000ULL
#define GAUDI2_DCORE2_TPC4_SPMU_BASE 0x1000007FFA801000ULL
#define GAUDI2_DCORE2_TPC5_SPMU_BASE 0x1000007FFAA01000ULL

#define GAUDI2_DCORE3_TPC0_SPMU_BASE 0x1000007FFB001000ULL
#define GAUDI2_DCORE3_TPC1_SPMU_BASE 0x1000007FFB201000ULL
#define GAUDI2_DCORE3_TPC2_SPMU_BASE 0x1000007FFB401000ULL
#define GAUDI2_DCORE3_TPC3_SPMU_BASE 0x1000007FFB601000ULL
#define GAUDI2_DCORE3_TPC4_SPMU_BASE 0x1000007FFB801000ULL
#define GAUDI2_DCORE3_TPC5_SPMU_BASE 0x1000007FFBA01000ULL

/*
 * EDMA SPMU base addresses
 * Two EDMAs per DCORE; the values use the same spacing as the TPC table
 * (0x200000 between EDMA0 and EDMA1, 0x1000000 between DCOREs).
 */
#define GAUDI2_DCORE0_EDMA0_SPMU_BASE 0x1000007FF0001000ULL
#define GAUDI2_DCORE0_EDMA1_SPMU_BASE 0x1000007FF0201000ULL
#define GAUDI2_DCORE1_EDMA0_SPMU_BASE 0x1000007FF1001000ULL
#define GAUDI2_DCORE1_EDMA1_SPMU_BASE 0x1000007FF1201000ULL
#define GAUDI2_DCORE2_EDMA0_SPMU_BASE 0x1000007FF2001000ULL
#define GAUDI2_DCORE2_EDMA1_SPMU_BASE 0x1000007FF2201000ULL
#define GAUDI2_DCORE3_EDMA0_SPMU_BASE 0x1000007FF3001000ULL
#define GAUDI2_DCORE3_EDMA1_SPMU_BASE 0x1000007FF3201000ULL

/*
 * PDMA SPMU base addresses
 * Not named per DCORE; the two instances are 0x40000 apart.
 */
#define GAUDI2_PDMA0_SPMU_BASE 0x1000007FFC4A1000ULL
#define GAUDI2_PDMA1_SPMU_BASE 0x1000007FFC4E1000ULL

/*
 * Event Info Structure
 *
 * Describes one native event: its printable name and description, the engine
 * type it belongs to, and its numeric ID within that engine.
 * Both strings are const char pointers; this header does not say who owns them.
 */
typedef struct {
const char *name; /* Event name (e.g., "TPC_KERNEL_EXECUTED") */
const char *description; /* Descriptive text for the event */
gaudi2_engine_type_t engine; /* Engine type */
/* NOTE(review): presumably a value from the matching per-engine *_spmu_event_t enum - confirm against the event table */
unsigned int event_id; /* Hardware event ID within engine */
} gaudi2_native_event_t;

#endif /* _GAUDI2_EVENTS_H */
Loading