Skip to content

Commit b757f0f

Browse files
committed
Add gaudi2 component for Intel Habana Gaudi2 AI accelerator
1 parent ddaf719 commit b757f0f

File tree

4 files changed

+1129
-0
lines changed

4 files changed

+1129
-0
lines changed

src/components/gaudi2/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# PAPI Gaudi2 Component
2+
3+
This PAPI component provides access to hardware performance counters on Intel Gaudi2 AI Accelerators through the SPMU interface.
4+
5+
## Overview
6+
7+
The Gaudi2 component enables monitoring of:
8+
- **TPC (Tensor Processing Core)** - 24 TPCs across 4 DCOREs
9+
- **EDMA (External DMA)** - 8 EDMAs for data movement
10+
- **PDMA (PCIe DMA)** - 2 PDMAs for host-device transfers
11+
- **MME (Matrix Multiplication Engine)** - 4 MMEs for matrix operations
12+
13+
Each SPMU unit supports up to 6 programmable counters that can be configured to count various hardware events.
14+
15+
## Requirements
16+
17+
- **Hardware**: Intel Gaudi2 AI Accelerator
18+
- **Software**:
19+
- Habana Labs driver and runtime
20+
- libhl-thunk.so (Habana thunk library)
21+
- Access to `/dev/accel/accel*` devices
22+
- **Permissions**: User must have read/write access to accelerator devices
23+
24+
## Building
25+
26+
Set the `PAPI_GAUDI2_ROOT` environment variable to the root of the Habana Labs installation, i.e. the directory that contains `include/habanalabs/hlthunk.h` and `lib/habanalabs/libhl-thunk.so`. For example:
27+
`export PAPI_GAUDI2_ROOT=/usr`
28+
29+
Configure the component using:
30+
`./configure --with-components="gaudi2"`
31+
32+
then build with:
33+
`make && make install`

src/components/gaudi2/Rules.gaudi2

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Rules for the Gaudi2 component
#
# This component provides access to Intel Gaudi2 AI Accelerator
# hardware performance counters (SPMU).
#
# Requirements:
#   - libhl-thunk.so (Habana Labs thunk library)
#   - /dev/accel/accel* device access
#
# Build with:
#   ./configure --with-components="gaudi2"
#
# Environment variables (REQUIRED):
#   PAPI_GAUDI2_ROOT - Path to Habana installation root
#                      Example: /usr (for container builds with headers in
#                      /usr/include/habanalabs and libraries in
#                      /usr/lib/habanalabs)
#
# Expected directory structure under PAPI_GAUDI2_ROOT:
#   $(PAPI_GAUDI2_ROOT)/include/habanalabs/hlthunk.h
#   $(PAPI_GAUDI2_ROOT)/lib/habanalabs/libhl-thunk.so
#

# Component source files
COMPSRCS += components/gaudi2/linux-gaudi2.c
COMPOBJS += linux-gaudi2.o

# Require PAPI_GAUDI2_ROOT to be set; fail at parse time with a hint otherwise
ifndef PAPI_GAUDI2_ROOT
$(error PAPI_GAUDI2_ROOT must be set. Example: export PAPI_GAUDI2_ROOT=/usr)
endif

GAUDI2_INCDIR = $(PAPI_GAUDI2_ROOT)/include/habanalabs
GAUDI2_LIBDIR = $(PAPI_GAUDI2_ROOT)/lib/habanalabs

# Compiler flags
# -I$(GAUDI2_INCDIR) for hlthunk.h and habanalabs_accel.h
CFLAGS += -I$(GAUDI2_INCDIR) -g

# Linker flags
# Use dlopen for hlthunk library (no direct linking)
LDFLAGS += $(LDL)

# Compilation rule (the recipe line below must begin with a TAB character)
linux-gaudi2.o: components/gaudi2/linux-gaudi2.c components/gaudi2/gaudi2_events.h $(HEADERS)
	$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c components/gaudi2/linux-gaudi2.c -o linux-gaudi2.o
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
/*
2+
* gaudi2_events.h - Gaudi2 SPMU event definitions
3+
*
4+
* Event IDs from hl-prof-config -c gaudi2 --spmu-help
5+
* SPMU base addresses from gaudi2_blocks.h
6+
*/
7+
8+
#ifndef _GAUDI2_EVENTS_H
9+
#define _GAUDI2_EVENTS_H
10+
11+
/*
 * Engine classes that an event descriptor can be attributed to.
 *
 * Values are assigned sequentially starting at 0. GAUDI2_ENGINE_MAX is a
 * sentinel equal to the number of engine classes and is not itself an
 * engine.
 */
typedef enum {
    GAUDI2_ENGINE_TPC = 0,
    GAUDI2_ENGINE_MME,
    GAUDI2_ENGINE_EDMA,
    GAUDI2_ENGINE_PDMA,
    GAUDI2_ENGINE_ROTATOR,
    GAUDI2_ENGINE_NIC,
    GAUDI2_ENGINE_HBM,
    GAUDI2_ENGINE_HMMU,
    GAUDI2_ENGINE_MAX
} gaudi2_engine_type_t;
22+
23+
/*
 * Instance counts.
 *
 * Per-DCORE counts multiply by GAUDI2_NUM_DCORES to give the device totals
 * quoted in the trailing comments.
 */
#define GAUDI2_NUM_DCORES       4
#define GAUDI2_TPC_PER_DCORE    6   /* 24 TPCs total */
#define GAUDI2_EDMA_PER_DCORE   2   /* 8 EDMAs total */
#define GAUDI2_MME_PER_DCORE    1   /* 4 MMEs total */
#define GAUDI2_HMMU_PER_DCORE   4   /* 16 HMMUs total */
#define GAUDI2_NUM_PDMA         2
#define GAUDI2_NUM_ROTATOR      2
#define GAUDI2_NUM_NIC          12
#define GAUDI2_NUM_HBM          6
#define GAUDI2_NUM_HBM_MC       2   /* 2 memory controllers per HBM */

/* Number of programmable counters available in one SPMU */
#define GAUDI2_MAX_SPMU_COUNTERS 6
36+
37+
/*
 * TPC SPMU events (81 events, IDs 0-80).
 *
 * Every enumerator carries an explicit value because the number is the
 * hardware event ID; do not reorder or renumber. TPC_SPMU_EVENT_MAX is a
 * sentinel equal to the number of events, not an event itself.
 */
typedef enum {
    TPC_SPMU_MEMORY2SB_BP                       = 0,
    TPC_SPMU_SB2MEMORY_BP                       = 1,
    TPC_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY          = 2,
    TPC_SPMU_QM_PREFETCH_BUFFER_EMPTY           = 3,
    TPC_SPMU_SB_2_CORE_BP                       = 4,
    TPC_SPMU_SB_2_CORE_BP_SB_FULL               = 5,
    TPC_SPMU_SB_2_CORE_BP_SB_MEMORY             = 6,
    TPC_SPMU_SB_2_CORE_BP_SB_LD_TNSR_FIFO_FULL  = 7,
    TPC_SPMU_WB2CORE_BP                         = 8,
    TPC_SPMU_STALL_ON_ICACHE_MISS               = 9,
    TPC_SPMU_STALL_ON_DCACHE_MISS               = 10,
    TPC_SPMU_STALL_ON_POP_FROM_SB               = 11,
    TPC_SPMU_STALL_ON_LOOKUP_CACHE_MISS         = 12,
    TPC_SPMU_STALL_ON_IRQ_FULL                  = 13,
    TPC_SPMU_STALL_ON_MAX_COLORS                = 14,
    TPC_SPMU_STALL_ON_UARCH_BUBBLE              = 15,
    TPC_SPMU_STALL_VPU                          = 16,
    TPC_SPMU_STALL_SPU_ANY                      = 17,
    TPC_SPMU_STALL_ON_TSB_FULL                  = 18,
    TPC_SPMU_STALL_ON_ST_L_EXT                  = 19,
    TPC_SPMU_STALL_ON_LD_L_EXT                  = 20,
    TPC_SPMU_STALL                              = 21,

    /* Opcode execution events - configurable via TPC_CFG_OPCODE_EXEC */
    TPC_SPMU_NUM_OF_OPCODE1_EXECUTED            = 22,
    TPC_SPMU_NUM_OF_OPCODE2_EXECUTED            = 23,
    TPC_SPMU_NUM_OF_OPCODE3_EXECUTED            = 24,
    TPC_SPMU_NUM_OF_OPCODE4_EXECUTED            = 25,

    /* Execution events */
    TPC_SPMU_KERNEL_EXECUTED                    = 26,
    TPC_SPMU_SCALAR_PIPE_EXEC                   = 27,
    TPC_SPMU_VECTOR_PIPE_EXEC                   = 28,

    /* Cache events */
    TPC_SPMU_ICACHE_MISS                        = 29,
    TPC_SPMU_ICACHE_HIT                         = 30,
    TPC_SPMU_KILLED_INSTRUCTION                 = 31,
    TPC_SPMU_LUT_MISS                           = 32,
    TPC_SPMU_DCACHE_MISS                        = 33,
    TPC_SPMU_DCACHE_HIT                         = 34,

    /* Out of bounds events */
    TPC_SPMU_OUT_OF_BOUND_DIM0                  = 35,
    TPC_SPMU_OUT_OF_BOUND_DIM1                  = 36,
    TPC_SPMU_OUT_OF_BOUND_DIM2                  = 37,
    TPC_SPMU_OUT_OF_BOUND_DIM3                  = 38,
    TPC_SPMU_OUT_OF_BOUND_DIM4                  = 39,

    /* Arithmetic exception events */
    TPC_SPMU_DIV_BY_0                           = 40,
    TPC_SPMU_SPU_MAC_OVERFLOW                   = 41,
    TPC_SPMU_SPU_ADDSUB_OVERFLOW                = 42,
    TPC_SPMU_SPU_ABS_OVERFLOW                   = 43,
    TPC_SPMU_SPU_FMA_FP_DST_NAN                 = 44,
    TPC_SPMU_SPU_FMA_FP_DST_INF                 = 45,
    TPC_SPMU_SPU_CONVERT_FP_DST_NAN             = 46,
    TPC_SPMU_SPU_CONVERT_FP_DST_INF             = 47,
    TPC_SPMU_SPU_FP_DST_DENORM                  = 48,
    TPC_SPMU_VPU_MAC_OVERFLOW                   = 49,
    TPC_SPMU_VPU_ADDSUB_OVERFLOW                = 50,
    TPC_SPMU_VPU_ABS_OVERFLOW                   = 51,
    TPC_SPMU_VPU_CONVERT_FP_DST_NAN             = 52,
    TPC_SPMU_VPU_CONVERT_FP_DST_INF             = 53,
    TPC_SPMU_VPU_FMA_FP_DST_NAN                 = 54,
    TPC_SPMU_VPU_FMA_FP_DST_INF                 = 55,
    TPC_SPMU_VPU_FP_DST_DENORM                  = 56,

    /* Additional events */
    TPC_SPMU_STALL_ON_ST_TSNR_FULL              = 57,
    TPC_SPMU_LUT_HIT                            = 58,
    TPC_SPMU_ADDRESS_EXCEED_VLM                 = 59,
    TPC_SPMU_LD_LOCK_RESEND                     = 60,
    TPC_SPMU_LD_L_PROT_VIO                      = 61,
    TPC_SPMU_ST_L_PROT_VIO                      = 62,
    TPC_SPMU_DCACHE_L0CD_MISMATCH               = 63,
    TPC_SPMU_TPC_STALL_ON_LD_L_INT              = 64,
    TPC_SPMU_SB_FIRST_RESPONSE                  = 65,
    TPC_SPMU_SB_LAST_RESPONSE                   = 66,

    /* SB occupancy events */
    TPC_SPMU_SB_OCCUPANCY0                      = 67,
    TPC_SPMU_SB_OCCUPANCY1                      = 68,
    TPC_SPMU_SB_OCCUPANCY2                      = 69,
    TPC_SPMU_SB_OCCUPANCY3                      = 70,

    /* SB CAM events */
    TPC_SPMU_SB_DBG_CAM0_MISS                   = 71,
    TPC_SPMU_SB_DBG_CAM0_HIT                    = 72,
    TPC_SPMU_SB_DBG_CAM0_UNCACHEABLE            = 73,
    TPC_SPMU_SB_DBG_CAM1_MISS                   = 74,
    TPC_SPMU_SB_DBG_CAM1_HIT                    = 75,
    TPC_SPMU_SB_DBG_CAM1_UNCACHEABLE            = 76,

    /* Additional cache events */
    TPC_SPMU_NOC_2_SB_BP                        = 77,
    TPC_SPMU_DCACHE_HW_PREF                     = 78,
    TPC_SPMU_DCACHE_UC                          = 79,
    TPC_SPMU_DCACHE_DEALIGN                     = 80,

    TPC_SPMU_EVENT_MAX                          = 81
} gaudi2_tpc_spmu_event_t;
141+
142+
/*
 * EDMA SPMU events (50 events, IDs 0-49).
 *
 * Every enumerator carries an explicit value because the number is the
 * hardware event ID; do not reorder or renumber. EDMA_SPMU_EVENT_MAX is a
 * sentinel equal to the number of events, not an event itself.
 */
typedef enum {
    EDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN          = 0,
    EDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN          = 1,
    EDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN          = 2,
    EDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN          = 3,
    EDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN          = 4,
    EDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN          = 5,
    EDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN          = 6,
    EDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN          = 7,
    EDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN      = 8,
    EDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN  = 9,
    EDMA_SPMU_AXI_HBW_ERR                       = 10,
    EDMA_SPMU_AXI_LBW_ERR                       = 11,
    EDMA_SPMU_TRACE_FENCE_START                 = 12,
    EDMA_SPMU_TRACE_FENCE_DONE                  = 13,
    EDMA_SPMU_TRACE_CP_SW_STOP                  = 14,
    EDMA_SPMU_CP_ERR                            = 15,
    EDMA_SPMU_ARB_ERR                           = 16,
    EDMA_SPMU_TRACE_CHOICE_WIN_PUSH             = 17,
    EDMA_SPMU_DBG_DMA_TRC_DESC_PUSH             = 18,
    EDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT          = 19,
    EDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH     = 20,
    EDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH     = 21,
    EDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH     = 22,
    EDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH     = 23,
    EDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST          = 24,
    EDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST          = 25,
    EDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST          = 26,
    EDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST          = 27,
    EDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP           = 28,
    EDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP           = 29,
    EDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP           = 30,
    EDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START     = 31,
    EDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START     = 32,
    EDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP            = 33,
    EDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP           = 34,
    EDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP            = 35,
    EDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP           = 36,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0         = 37,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1         = 38,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2         = 39,
    EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3         = 40,
    EDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL         = 41,
    EDMA_SPMU_SB_2_INITIATOR_BP                 = 42,
    EDMA_SPMU_SB_DBG_CAM0_MISS                  = 43,
    EDMA_SPMU_SB_DBG_CAM0_HIT                   = 44,
    EDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE           = 45,
    EDMA_SPMU_SB_DBG_CAM1_MISS                  = 46,
    EDMA_SPMU_SB_DBG_CAM1_HIT                   = 47,
    EDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE           = 48,
    EDMA_SPMU_SB_AXI_NOC_2_SB_BP                = 49,

    EDMA_SPMU_EVENT_MAX                         = 50
} gaudi2_edma_spmu_event_t;
197+
198+
/*
 * MME CTRL SPMU events (8 events, IDs 0-7).
 *
 * Every enumerator carries an explicit value because the number is the
 * hardware event ID. MME_CTRL_SPMU_EVENT_MAX is a sentinel equal to the
 * number of events, not an event itself.
 */
typedef enum {
    MME_CTRL_SPMU_CONV_END_STALL_DIAG           = 0,
    MME_CTRL_SPMU_CONV_END_STALL_ACC            = 1,
    MME_CTRL_SPMU_CONV_END_STALL_DIAG_STALL_ACC = 2,
    MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_B      = 3,
    MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_A      = 4,
    MME_CTRL_SPMU_NUM_OUTER_PRODUCTS            = 5,
    MME_CTRL_SPMU_QM_PREFETCH_BUFFER_EMPTY      = 6,
    MME_CTRL_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY     = 7,

    MME_CTRL_SPMU_EVENT_MAX                     = 8
} gaudi2_mme_ctrl_spmu_event_t;
211+
212+
/* SPMU Base Addresses */

/*
 * TPC SPMU base addresses - pattern: DCORE + TPC offset + SPMU offset.
 * In the values below, consecutive TPCs within a DCORE are 0x200000 apart
 * and consecutive DCOREs are 0x1000000 apart.
 */
#define GAUDI2_DCORE0_TPC0_SPMU_BASE    0x1000007FF8001000ULL
#define GAUDI2_DCORE0_TPC1_SPMU_BASE    0x1000007FF8201000ULL
#define GAUDI2_DCORE0_TPC2_SPMU_BASE    0x1000007FF8401000ULL
#define GAUDI2_DCORE0_TPC3_SPMU_BASE    0x1000007FF8601000ULL
#define GAUDI2_DCORE0_TPC4_SPMU_BASE    0x1000007FF8801000ULL
#define GAUDI2_DCORE0_TPC5_SPMU_BASE    0x1000007FF8A01000ULL

#define GAUDI2_DCORE1_TPC0_SPMU_BASE    0x1000007FF9001000ULL
#define GAUDI2_DCORE1_TPC1_SPMU_BASE    0x1000007FF9201000ULL
#define GAUDI2_DCORE1_TPC2_SPMU_BASE    0x1000007FF9401000ULL
#define GAUDI2_DCORE1_TPC3_SPMU_BASE    0x1000007FF9601000ULL
#define GAUDI2_DCORE1_TPC4_SPMU_BASE    0x1000007FF9801000ULL
#define GAUDI2_DCORE1_TPC5_SPMU_BASE    0x1000007FF9A01000ULL

#define GAUDI2_DCORE2_TPC0_SPMU_BASE    0x1000007FFA001000ULL
#define GAUDI2_DCORE2_TPC1_SPMU_BASE    0x1000007FFA201000ULL
#define GAUDI2_DCORE2_TPC2_SPMU_BASE    0x1000007FFA401000ULL
#define GAUDI2_DCORE2_TPC3_SPMU_BASE    0x1000007FFA601000ULL
#define GAUDI2_DCORE2_TPC4_SPMU_BASE    0x1000007FFA801000ULL
#define GAUDI2_DCORE2_TPC5_SPMU_BASE    0x1000007FFAA01000ULL

#define GAUDI2_DCORE3_TPC0_SPMU_BASE    0x1000007FFB001000ULL
#define GAUDI2_DCORE3_TPC1_SPMU_BASE    0x1000007FFB201000ULL
#define GAUDI2_DCORE3_TPC2_SPMU_BASE    0x1000007FFB401000ULL
#define GAUDI2_DCORE3_TPC3_SPMU_BASE    0x1000007FFB601000ULL
#define GAUDI2_DCORE3_TPC4_SPMU_BASE    0x1000007FFB801000ULL
#define GAUDI2_DCORE3_TPC5_SPMU_BASE    0x1000007FFBA01000ULL

/*
 * EDMA SPMU base addresses.
 * Same spacing as the TPC table: 0x200000 between the two EDMAs of a DCORE,
 * 0x1000000 between DCOREs.
 */
#define GAUDI2_DCORE0_EDMA0_SPMU_BASE   0x1000007FF0001000ULL
#define GAUDI2_DCORE0_EDMA1_SPMU_BASE   0x1000007FF0201000ULL
#define GAUDI2_DCORE1_EDMA0_SPMU_BASE   0x1000007FF1001000ULL
#define GAUDI2_DCORE1_EDMA1_SPMU_BASE   0x1000007FF1201000ULL
#define GAUDI2_DCORE2_EDMA0_SPMU_BASE   0x1000007FF2001000ULL
#define GAUDI2_DCORE2_EDMA1_SPMU_BASE   0x1000007FF2201000ULL
#define GAUDI2_DCORE3_EDMA0_SPMU_BASE   0x1000007FF3001000ULL
#define GAUDI2_DCORE3_EDMA1_SPMU_BASE   0x1000007FF3201000ULL

/* PDMA SPMU base addresses */
#define GAUDI2_PDMA0_SPMU_BASE          0x1000007FFC4A1000ULL
#define GAUDI2_PDMA1_SPMU_BASE          0x1000007FFC4E1000ULL
256+
257+
/*
 * Debug Operation Codes (from habanalabs_accel.h).
 *
 * Each macro is defined only if it is not already defined, so that including
 * the vendor header before this one cannot produce a conflicting macro
 * redefinition; in that case the vendor's definition is used.
 */
#ifndef HL_DEBUG_OP_ETR
#define HL_DEBUG_OP_ETR                 0
#endif
#ifndef HL_DEBUG_OP_ETF
#define HL_DEBUG_OP_ETF                 1
#endif
#ifndef HL_DEBUG_OP_STM
#define HL_DEBUG_OP_STM                 2
#endif
#ifndef HL_DEBUG_OP_FUNNEL
#define HL_DEBUG_OP_FUNNEL              3
#endif
#ifndef HL_DEBUG_OP_BMON
#define HL_DEBUG_OP_BMON                4
#endif
#ifndef HL_DEBUG_OP_SPMU
#define HL_DEBUG_OP_SPMU                5
#endif
#ifndef HL_DEBUG_OP_TIMESTAMP
#define HL_DEBUG_OP_TIMESTAMP           6
#endif
#ifndef HL_DEBUG_OP_SET_MODE
#define HL_DEBUG_OP_SET_MODE            7
#endif
#ifndef HL_DEBUG_OP_FETCH_TRACE
#define HL_DEBUG_OP_FETCH_TRACE         8
#endif
#ifndef HL_DEBUG_OP_DIO
#define HL_DEBUG_OP_DIO                 9
#endif
#ifndef HL_DEBUG_OP_READMEM
#define HL_DEBUG_OP_READMEM             1024
#endif
#ifndef HL_DEBUG_OP_MEMCPY
#define HL_DEBUG_OP_MEMCPY              1025
#endif
#ifndef HL_DEBUG_OP_SCHED_SUBMIT_BUF
#define HL_DEBUG_OP_SCHED_SUBMIT_BUF    1031
#endif
#ifndef HL_DEBUG_OP_READBLOCK
#define HL_DEBUG_OP_READBLOCK           1032
#endif

#ifndef HL_DEBUG_MAX_AUX_VALUES
#define HL_DEBUG_MAX_AUX_VALUES         10
#endif
274+
275+
/* Event Info Structure */
276+
typedef struct {
277+
const char *name; /* Event name (e.g., "TPC_KERNEL_EXECUTED") */
278+
const char *description;
279+
gaudi2_engine_type_t engine; /* Engine type */
280+
unsigned int event_id; /* Hardware event ID within engine */
281+
} gaudi2_native_event_t;
282+
283+
#endif /* _GAUDI2_EVENTS_H */

0 commit comments

Comments
 (0)