diff --git a/src/components/gaudi2/README.md b/src/components/gaudi2/README.md new file mode 100644 index 000000000..d6d703bcc --- /dev/null +++ b/src/components/gaudi2/README.md @@ -0,0 +1,22 @@ +# Gaudi2 Component + +The `gaudi2` component provides access to hardware performance counters on Intel Gaudi2 AI Accelerators through the SPMU interface. + +- [Environment Variables](#environment-variables) +- [Enabling the Gaudi2 Component](#enabling-the-gaudi2-component) + +## Environment Variables +The `gaudi2` component requires setting the `PAPI_GAUDI2_ROOT` environment variable to the habanalabs installation directory containing the `hl-thunk` headers and libraries. + +```bash +export PAPI_GAUDI2_ROOT=/usr +``` + +## Enabling the Gaudi2 Component + +To enable the `gaudi2` component, configure and build PAPI with the component enabled as follows: + +```bash +./configure --with-components="gaudi2" +make && make install +``` \ No newline at end of file diff --git a/src/components/gaudi2/Rules.gaudi2 b/src/components/gaudi2/Rules.gaudi2 new file mode 100644 index 000000000..ad4abc594 --- /dev/null +++ b/src/components/gaudi2/Rules.gaudi2 @@ -0,0 +1,30 @@ +# Rules for the Gaudi2 component +# Environment variables (REQUIRED): +# PAPI_GAUDI2_ROOT - Path to Habana installation root +# Example: /usr (for container builds with headers in /usr/include/habanalabs +# and libraries in /usr/lib/habanalabs) +# Expected directory structure under PAPI_GAUDI2_ROOT: +# $(PAPI_GAUDI2_ROOT)/include/habanalabs/hlthunk.h +# $(PAPI_GAUDI2_ROOT)/lib/habanalabs/libhl-thunk.so + +# Component source files +COMPSRCS += components/gaudi2/linux-gaudi2.c +COMPOBJS += linux-gaudi2.o + +# Default to /usr if not set +PAPI_GAUDI2_ROOT ?= /usr + +GAUDI2_INCDIR = $(PAPI_GAUDI2_ROOT)/include/habanalabs +GAUDI2_LIBDIR = $(PAPI_GAUDI2_ROOT)/lib/habanalabs + +# Compiler flags
# -I$(GAUDI2_INCDIR) for hlthunk.h and habanalabs_accel.h +CFLAGS += -I$(GAUDI2_INCDIR) -I$(PAPI_GAUDI2_ROOT)/include/drm -g + +# Linker flags +# Use 
dlopen for hlthunk library (no direct linking) +LDFLAGS += $(LDL) + +# Compilation rule +linux-gaudi2.o: components/gaudi2/linux-gaudi2.c components/gaudi2/gaudi2_events.h $(HEADERS) + $(CC) $(LIBCFLAGS) $(OPTFLAGS) -c components/gaudi2/linux-gaudi2.c -o linux-gaudi2.o diff --git a/src/components/gaudi2/gaudi2_events.h b/src/components/gaudi2/gaudi2_events.h new file mode 100644 index 000000000..a5f2be910 --- /dev/null +++ b/src/components/gaudi2/gaudi2_events.h @@ -0,0 +1,320 @@ +/** + * @file gaudi2_events.h + * + * @author Tokey Tahmid ttahmid@icl.utk.edu + */ + +#ifndef _GAUDI2_EVENTS_H +#define _GAUDI2_EVENTS_H + +typedef enum { + GAUDI2_ENGINE_TPC = 0, + GAUDI2_ENGINE_MME, + GAUDI2_ENGINE_EDMA, + GAUDI2_ENGINE_PDMA, + GAUDI2_ENGINE_ROTATOR, + GAUDI2_ENGINE_NIC, + GAUDI2_ENGINE_HBM, + GAUDI2_ENGINE_HMMU, + GAUDI2_ENGINE_MAX +} gaudi2_engine_type_t; + +/* Instance counts */ +#define GAUDI2_NUM_DCORES 4 +#define GAUDI2_TPC_PER_DCORE 6 /* 24 TPCs total */ +#define GAUDI2_EDMA_PER_DCORE 2 /* 8 EDMAs total */ +#define GAUDI2_MME_PER_DCORE 1 /* 4 MMEs total */ +#define GAUDI2_HMMU_PER_DCORE 4 /* 16 HMMUs total */ +#define GAUDI2_NUM_PDMA 2 +#define GAUDI2_NUM_ROTATOR 2 +#define GAUDI2_NUM_NIC 12 +#define GAUDI2_NUM_HBM 6 +#define GAUDI2_NUM_HBM_MC 2 /* 2 memory controllers per HBM */ + +#define GAUDI2_MAX_SPMU_COUNTERS 6 + +/* TPC SPMU events (81 events) */ +typedef enum { + TPC_SPMU_MEMORY2SB_BP = 0, + TPC_SPMU_SB2MEMORY_BP = 1, + TPC_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY = 2, + TPC_SPMU_QM_PREFETCH_BUFFER_EMPTY = 3, + TPC_SPMU_SB_2_CORE_BP = 4, + TPC_SPMU_SB_2_CORE_BP_SB_FULL = 5, + TPC_SPMU_SB_2_CORE_BP_SB_MEMORY = 6, + TPC_SPMU_SB_2_CORE_BP_SB_LD_TNSR_FIFO_FULL = 7, + TPC_SPMU_WB2CORE_BP = 8, + TPC_SPMU_STALL_ON_ICACHE_MISS = 9, + TPC_SPMU_STALL_ON_DCACHE_MISS = 10, + TPC_SPMU_STALL_ON_POP_FROM_SB = 11, + TPC_SPMU_STALL_ON_LOOKUP_CACHE_MISS = 12, + TPC_SPMU_STALL_ON_IRQ_FULL = 13, + TPC_SPMU_STALL_ON_MAX_COLORS = 14, + TPC_SPMU_STALL_ON_UARCH_BUBBLE = 15, + 
TPC_SPMU_STALL_VPU = 16, + TPC_SPMU_STALL_SPU_ANY = 17, + TPC_SPMU_STALL_ON_TSB_FULL = 18, + TPC_SPMU_STALL_ON_ST_L_EXT = 19, + TPC_SPMU_STALL_ON_LD_L_EXT = 20, + TPC_SPMU_STALL = 21, + + /* Opcode execution events - configurable via TPC_CFG_OPCODE_EXEC */ + TPC_SPMU_NUM_OF_OPCODE1_EXECUTED = 22, + TPC_SPMU_NUM_OF_OPCODE2_EXECUTED = 23, + TPC_SPMU_NUM_OF_OPCODE3_EXECUTED = 24, + TPC_SPMU_NUM_OF_OPCODE4_EXECUTED = 25, + + /* Execution events */ + TPC_SPMU_KERNEL_EXECUTED = 26, + TPC_SPMU_SCALAR_PIPE_EXEC = 27, + TPC_SPMU_VECTOR_PIPE_EXEC = 28, + + /* Cache events */ + TPC_SPMU_ICACHE_MISS = 29, + TPC_SPMU_ICACHE_HIT = 30, + TPC_SPMU_KILLED_INSTRUCTION = 31, + TPC_SPMU_LUT_MISS = 32, + TPC_SPMU_DCACHE_MISS = 33, + TPC_SPMU_DCACHE_HIT = 34, + + /* Out of bounds events */ + TPC_SPMU_OUT_OF_BOUND_DIM0 = 35, + TPC_SPMU_OUT_OF_BOUND_DIM1 = 36, + TPC_SPMU_OUT_OF_BOUND_DIM2 = 37, + TPC_SPMU_OUT_OF_BOUND_DIM3 = 38, + TPC_SPMU_OUT_OF_BOUND_DIM4 = 39, + + /* Arithmetic exception events */ + TPC_SPMU_DIV_BY_0 = 40, + TPC_SPMU_SPU_MAC_OVERFLOW = 41, + TPC_SPMU_SPU_ADDSUB_OVERFLOW = 42, + TPC_SPMU_SPU_ABS_OVERFLOW = 43, + TPC_SPMU_SPU_FMA_FP_DST_NAN = 44, + TPC_SPMU_SPU_FMA_FP_DST_INF = 45, + TPC_SPMU_SPU_CONVERT_FP_DST_NAN = 46, + TPC_SPMU_SPU_CONVERT_FP_DST_INF = 47, + TPC_SPMU_SPU_FP_DST_DENORM = 48, + TPC_SPMU_VPU_MAC_OVERFLOW = 49, + TPC_SPMU_VPU_ADDSUB_OVERFLOW = 50, + TPC_SPMU_VPU_ABS_OVERFLOW = 51, + TPC_SPMU_VPU_CONVERT_FP_DST_NAN = 52, + TPC_SPMU_VPU_CONVERT_FP_DST_INF = 53, + TPC_SPMU_VPU_FMA_FP_DST_NAN = 54, + TPC_SPMU_VPU_FMA_FP_DST_INF = 55, + TPC_SPMU_VPU_FP_DST_DENORM = 56, + + /* Additional events */ + TPC_SPMU_STALL_ON_ST_TSNR_FULL = 57, + TPC_SPMU_LUT_HIT = 58, + TPC_SPMU_ADDRESS_EXCEED_VLM = 59, + TPC_SPMU_LD_LOCK_RESEND = 60, + TPC_SPMU_LD_L_PROT_VIO = 61, + TPC_SPMU_ST_L_PROT_VIO = 62, + TPC_SPMU_DCACHE_L0CD_MISMATCH = 63, + TPC_SPMU_TPC_STALL_ON_LD_L_INT = 64, + TPC_SPMU_SB_FIRST_RESPONSE = 65, + TPC_SPMU_SB_LAST_RESPONSE = 66, + + /* SB occupancy events */ 
+ TPC_SPMU_SB_OCCUPANCY0 = 67, + TPC_SPMU_SB_OCCUPANCY1 = 68, + TPC_SPMU_SB_OCCUPANCY2 = 69, + TPC_SPMU_SB_OCCUPANCY3 = 70, + + /* SB CAM events */ + TPC_SPMU_SB_DBG_CAM0_MISS = 71, + TPC_SPMU_SB_DBG_CAM0_HIT = 72, + TPC_SPMU_SB_DBG_CAM0_UNCACHEABLE = 73, + TPC_SPMU_SB_DBG_CAM1_MISS = 74, + TPC_SPMU_SB_DBG_CAM1_HIT = 75, + TPC_SPMU_SB_DBG_CAM1_UNCACHEABLE = 76, + + /* Additional cache events */ + TPC_SPMU_NOC_2_SB_BP = 77, + TPC_SPMU_DCACHE_HW_PREF = 78, + TPC_SPMU_DCACHE_UC = 79, + TPC_SPMU_DCACHE_DEALIGN = 80, + + TPC_SPMU_EVENT_MAX = 81 +} gaudi2_tpc_spmu_event_t; + +/* EDMA SPMU Events (50 events, IDs 0-49) */ +typedef enum { + EDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN = 0, + EDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN = 1, + EDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN = 2, + EDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN = 3, + EDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN = 4, + EDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN = 5, + EDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN = 6, + EDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN = 7, + EDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN = 8, + EDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN = 9, + EDMA_SPMU_AXI_HBW_ERR = 10, + EDMA_SPMU_AXI_LBW_ERR = 11, + EDMA_SPMU_TRACE_FENCE_START = 12, + EDMA_SPMU_TRACE_FENCE_DONE = 13, + EDMA_SPMU_TRACE_CP_SW_STOP = 14, + EDMA_SPMU_CP_ERR = 15, + EDMA_SPMU_ARB_ERR = 16, + EDMA_SPMU_TRACE_CHOICE_WIN_PUSH = 17, + EDMA_SPMU_DBG_DMA_TRC_DESC_PUSH = 18, + EDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT = 19, + EDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH = 20, + EDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH = 21, + EDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH = 22, + EDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH = 23, + EDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST = 24, + EDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST = 25, + EDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST = 26, + EDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST = 27, + EDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP = 28, + EDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP = 29, + EDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP = 30, + EDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START = 31, + EDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START = 32, + 
EDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP = 33, + EDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP = 34, + EDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP = 35, + EDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP = 36, + EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0 = 37, + EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1 = 38, + EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2 = 39, + EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3 = 40, + EDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL = 41, + EDMA_SPMU_SB_2_INITIATOR_BP = 42, + EDMA_SPMU_SB_DBG_CAM0_MISS = 43, + EDMA_SPMU_SB_DBG_CAM0_HIT = 44, + EDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE = 45, + EDMA_SPMU_SB_DBG_CAM1_MISS = 46, + EDMA_SPMU_SB_DBG_CAM1_HIT = 47, + EDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE = 48, + EDMA_SPMU_SB_AXI_NOC_2_SB_BP = 49, + + EDMA_SPMU_EVENT_MAX = 50 +} gaudi2_edma_spmu_event_t; + +/* MME CTRL SPMU Events (8 events, IDs 0-7) */ +typedef enum { + MME_CTRL_SPMU_CONV_END_STALL_DIAG = 0, + MME_CTRL_SPMU_CONV_END_STALL_ACC = 1, + MME_CTRL_SPMU_CONV_END_STALL_DIAG_STALL_ACC = 2, + MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_B = 3, + MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_A = 4, + MME_CTRL_SPMU_NUM_OUTER_PRODUCTS = 5, + MME_CTRL_SPMU_QM_PREFETCH_BUFFER_EMPTY = 6, + MME_CTRL_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY = 7, + + MME_CTRL_SPMU_EVENT_MAX = 8 +} gaudi2_mme_ctrl_spmu_event_t; + +/* PDMA SPMU Events (50 events, IDs 0-49) */ +typedef enum { + PDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN = 0, + PDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN = 1, + PDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN = 2, + PDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN = 3, + PDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN = 4, + PDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN = 5, + PDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN = 6, + PDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN = 7, + PDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN = 8, + PDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN = 9, + PDMA_SPMU_AXI_HBW_ERR = 10, + PDMA_SPMU_AXI_LBW_ERR = 11, + PDMA_SPMU_TRACE_FENCE_START = 12, + PDMA_SPMU_TRACE_FENCE_DONE = 13, + PDMA_SPMU_TRACE_CP_SW_STOP = 14, + PDMA_SPMU_CP_ERR = 15, + PDMA_SPMU_ARB_ERR = 16, + PDMA_SPMU_TRACE_CHOICE_WIN_PUSH = 17, + 
PDMA_SPMU_DBG_DMA_TRC_DESC_PUSH = 18, + PDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT = 19, + PDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH = 20, + PDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH = 21, + PDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH = 22, + PDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH = 23, + PDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST = 24, + PDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST = 25, + PDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST = 26, + PDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST = 27, + PDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP = 28, + PDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP = 29, + PDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP = 30, + PDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START = 31, + PDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START = 32, + PDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP = 33, + PDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP = 34, + PDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP = 35, + PDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP = 36, + PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0 = 37, + PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1 = 38, + PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2 = 39, + PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3 = 40, + PDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL = 41, + PDMA_SPMU_SB_2_INITIATOR_BP = 42, + PDMA_SPMU_SB_DBG_CAM0_MISS = 43, + PDMA_SPMU_SB_DBG_CAM0_HIT = 44, + PDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE = 45, + PDMA_SPMU_SB_DBG_CAM1_MISS = 46, + PDMA_SPMU_SB_DBG_CAM1_HIT = 47, + PDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE = 48, + PDMA_SPMU_SB_AXI_NOC_2_SB_BP = 49, + + PDMA_SPMU_EVENT_MAX = 50 +} gaudi2_pdma_spmu_event_t; + +/* SPMU Base Addresses */ + +/* TPC SPMU base addresses - pattern: DCORE + TPC offset + SPMU offset */ +#define GAUDI2_DCORE0_TPC0_SPMU_BASE 0x1000007FF8001000ULL +#define GAUDI2_DCORE0_TPC1_SPMU_BASE 0x1000007FF8201000ULL +#define GAUDI2_DCORE0_TPC2_SPMU_BASE 0x1000007FF8401000ULL +#define GAUDI2_DCORE0_TPC3_SPMU_BASE 0x1000007FF8601000ULL +#define GAUDI2_DCORE0_TPC4_SPMU_BASE 0x1000007FF8801000ULL +#define GAUDI2_DCORE0_TPC5_SPMU_BASE 0x1000007FF8A01000ULL + +#define GAUDI2_DCORE1_TPC0_SPMU_BASE 0x1000007FF9001000ULL +#define GAUDI2_DCORE1_TPC1_SPMU_BASE 0x1000007FF9201000ULL 
+#define GAUDI2_DCORE1_TPC2_SPMU_BASE 0x1000007FF9401000ULL +#define GAUDI2_DCORE1_TPC3_SPMU_BASE 0x1000007FF9601000ULL +#define GAUDI2_DCORE1_TPC4_SPMU_BASE 0x1000007FF9801000ULL +#define GAUDI2_DCORE1_TPC5_SPMU_BASE 0x1000007FF9A01000ULL + +#define GAUDI2_DCORE2_TPC0_SPMU_BASE 0x1000007FFA001000ULL +#define GAUDI2_DCORE2_TPC1_SPMU_BASE 0x1000007FFA201000ULL +#define GAUDI2_DCORE2_TPC2_SPMU_BASE 0x1000007FFA401000ULL +#define GAUDI2_DCORE2_TPC3_SPMU_BASE 0x1000007FFA601000ULL +#define GAUDI2_DCORE2_TPC4_SPMU_BASE 0x1000007FFA801000ULL +#define GAUDI2_DCORE2_TPC5_SPMU_BASE 0x1000007FFAA01000ULL + +#define GAUDI2_DCORE3_TPC0_SPMU_BASE 0x1000007FFB001000ULL +#define GAUDI2_DCORE3_TPC1_SPMU_BASE 0x1000007FFB201000ULL +#define GAUDI2_DCORE3_TPC2_SPMU_BASE 0x1000007FFB401000ULL +#define GAUDI2_DCORE3_TPC3_SPMU_BASE 0x1000007FFB601000ULL +#define GAUDI2_DCORE3_TPC4_SPMU_BASE 0x1000007FFB801000ULL +#define GAUDI2_DCORE3_TPC5_SPMU_BASE 0x1000007FFBA01000ULL + +/* EDMA SPMU base addresses */ +#define GAUDI2_DCORE0_EDMA0_SPMU_BASE 0x1000007FF0001000ULL +#define GAUDI2_DCORE0_EDMA1_SPMU_BASE 0x1000007FF0201000ULL +#define GAUDI2_DCORE1_EDMA0_SPMU_BASE 0x1000007FF1001000ULL +#define GAUDI2_DCORE1_EDMA1_SPMU_BASE 0x1000007FF1201000ULL +#define GAUDI2_DCORE2_EDMA0_SPMU_BASE 0x1000007FF2001000ULL +#define GAUDI2_DCORE2_EDMA1_SPMU_BASE 0x1000007FF2201000ULL +#define GAUDI2_DCORE3_EDMA0_SPMU_BASE 0x1000007FF3001000ULL +#define GAUDI2_DCORE3_EDMA1_SPMU_BASE 0x1000007FF3201000ULL + +/* PDMA SPMU base addresses */ +#define GAUDI2_PDMA0_SPMU_BASE 0x1000007FFC4A1000ULL +#define GAUDI2_PDMA1_SPMU_BASE 0x1000007FFC4E1000ULL + +/* Event Info Structure */ +typedef struct { + const char *name; /* Event name (e.g., "TPC_KERNEL_EXECUTED") */ + const char *description; + gaudi2_engine_type_t engine; /* Engine type */ + unsigned int event_id; /* Hardware event ID within engine */ +} gaudi2_native_event_t; + +#endif /* _GAUDI2_EVENTS_H */ diff --git a/src/components/gaudi2/linux-gaudi2.c 
b/src/components/gaudi2/linux-gaudi2.c new file mode 100644 index 000000000..065788614 --- /dev/null +++ b/src/components/gaudi2/linux-gaudi2.c @@ -0,0 +1,1504 @@ +/** + * @file linux-gaudi2.c + * + * @author Tokey Tahmid ttahmid@icl.utk.edu + * + * @ingroup papi_components + * + * @brief + * This file implements a PAPI component for the Intel Gaudi2 SPMU counters + * Accesses hardware performance counters on Gaudi2 AI accelerators + * via the hlthunk library debug interface. + * + * The open source software license for PAPI conforms to the BSD + * License template. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "papi.h" +#include "papi_internal.h" +#include "papi_vector.h" +#include "papi_memory.h" + +#include "gaudi2_events.h" + +#define GAUDI2_MAX_COUNTERS 32 +#define GAUDI2_MAX_DEVICES 16 /* Max supported devices per node */ + +/* Eventset status flags */ +#define GAUDI2_EVENTS_STOPPED (0x0) +#define GAUDI2_EVENTS_RUNNING (0x2) + +/* Event code encoding: + * Bits 0-7: name ID (base event index in catalog) + * Bits 8-15: device index (0-255) + * Bits 16-23: flags (DEVICE_FLAG for device qualifier display) + * + * When flags=0: base event (for PAPI_ENUM_EVENTS enumeration) + * When flags=DEVICE_FLAG: device qualifier entry (for PAPI_NTV_ENUM_UMASKS) + */ +#define GAUDI2_NAMEID_SHIFT 0 +#define GAUDI2_NAMEID_WIDTH 8 +#define GAUDI2_DEVICE_SHIFT 8 +#define GAUDI2_DEVICE_WIDTH 8 +#define GAUDI2_FLAGS_SHIFT 16 +#define GAUDI2_FLAGS_WIDTH 8 + +#define GAUDI2_NAMEID_MASK ((0xFF) << GAUDI2_NAMEID_SHIFT) +#define GAUDI2_DEVICE_MASK ((0xFF) << GAUDI2_DEVICE_SHIFT) +#define GAUDI2_FLAGS_MASK ((0xFF) << GAUDI2_FLAGS_SHIFT) + +/* Flag definitions */ +#define GAUDI2_DEVICE_FLAG 0x1 /* Device qualifier entry */ + +/* Event info structure for encoding/decoding */ +typedef struct { + int nameid; /* Index in catalog */ + int device; /* Device index */ + int flags; /* GAUDI2_DEVICE_FLAG or 
0 */ +} gaudi2_event_info_t; + +static int gaudi2_evt_id_create(gaudi2_event_info_t *info, unsigned int *event_code) +{ + *event_code = (unsigned int)(info->nameid << GAUDI2_NAMEID_SHIFT); + *event_code |= (unsigned int)(info->device << GAUDI2_DEVICE_SHIFT); + *event_code |= (unsigned int)(info->flags << GAUDI2_FLAGS_SHIFT); + return PAPI_OK; +} + +static int gaudi2_evt_id_to_info(unsigned int event_code, gaudi2_event_info_t *info) +{ + info->nameid = (event_code & GAUDI2_NAMEID_MASK) >> GAUDI2_NAMEID_SHIFT; + info->device = (event_code & GAUDI2_DEVICE_MASK) >> GAUDI2_DEVICE_SHIFT; + info->flags = (event_code & GAUDI2_FLAGS_MASK) >> GAUDI2_FLAGS_SHIFT; + return PAPI_OK; +} + +/* hlthunk library header - provides: + * struct hl_debug_args, hl_debug_params_spmu, hlthunk_hw_ip_info + * enum hlthunk_device_name (HLTHUNK_DEVICE_GAUDI2, etc.) + * HLTHUNK_NODE_PRIMARY, HLTHUNK_NODE_CONTROL, HLTHUNK_MAX_MINOR + * HL_DEBUG_OP_*, HL_DEBUG_MAX_AUX_VALUES + * Function declarations for hlthunk_open, hlthunk_debug, etc. 
+ */ +#include "hlthunk.h" + +/* Function pointer types for dlsym */ +typedef int (*hlthunk_open_fn)(enum hlthunk_device_name device_name, const char *busid); +typedef int (*hlthunk_close_fn)(int fd); +typedef int (*hlthunk_debug_fn)(int fd, struct hl_debug_args *debug); +typedef enum hlthunk_device_name (*hlthunk_get_device_name_from_fd_fn)(int fd); +typedef int (*hlthunk_get_hw_ip_info_fn)(int fd, struct hlthunk_hw_ip_info *hw_ip); +typedef int (*hlthunk_get_device_count_fn)(enum hlthunk_device_name device_name); + +static void *hlthunk_handle = NULL; + +static hlthunk_open_fn p_hlthunk_open = NULL; +static hlthunk_close_fn p_hlthunk_close = NULL; +static hlthunk_debug_fn p_hlthunk_debug = NULL; +static hlthunk_get_device_name_from_fd_fn p_hlthunk_get_device_name_from_fd = NULL; +static hlthunk_get_hw_ip_info_fn p_hlthunk_get_hw_ip_info = NULL; +static hlthunk_get_device_count_fn p_hlthunk_get_device_count = NULL; + +/* Per-device state */ +typedef struct { + int device_idx; /* Index in device array (0 to num_devices-1) */ + int device_fd; /* File descriptor for this device */ + int owns_fd; /* 1 if opened fd, 0 if borrowed from runtime */ + int device_type; /* HLTHUNK_DEVICE_GAUDI2/2B/2C/2D */ + struct hlthunk_hw_ip_info hw_ip; /* Hardware IP info */ + int tpc_avail; /* TPC engine available */ + int edma_avail; /* EDMA engine available */ + int mme_avail; /* MME engine available */ + int pdma_avail; /* PDMA engine available */ +} gaudi2_device_t; + +/* Device table - populated during init */ +static gaudi2_device_t *gaudi2_devices = NULL; +static int gaudi2_num_devices = 0; + +/** +* Event Catalog +* TODO: add all gaudi2 events +*/ +static gaudi2_native_event_t gaudi2_event_catalog[] = { + /* ===== TPC SPMU events (81 events) ===== */ + + /* TPC backpressure (IDs 0-8) */ + {"TPC_MEMORY2SB_BP", "Back pressure from memory to TPC suspension buffer on read requests", GAUDI2_ENGINE_TPC, TPC_SPMU_MEMORY2SB_BP}, + {"TPC_SB2MEMORY_BP", "Back pressure from TPC suspension 
buffer to memory on read responses", GAUDI2_ENGINE_TPC, TPC_SPMU_SB2MEMORY_BP}, + {"TPC_PQ_NOT_EMPTY_BUT_CQ_EMPTY", "CQ FIFO pointers signal not empty but CQ buffer is empty waiting for data", GAUDI2_ENGINE_TPC, TPC_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY}, + {"TPC_QM_PREFETCH_BUFFER_EMPTY", "PQ FIFO pointers signal not empty but PQ buffer is empty waiting for data", GAUDI2_ENGINE_TPC, TPC_SPMU_QM_PREFETCH_BUFFER_EMPTY}, + {"TPC_SB_2_CORE_BP", "Internal back pressure from the suspension buffer to the core", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_2_CORE_BP}, + {"TPC_SB_2_CORE_BP_SB_FULL", "Internal back pressure from the suspension buffer to the core due to full", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_2_CORE_BP_SB_FULL}, + {"TPC_SB_2_CORE_BP_SB_MEMORY", "Back pressure from the suspension buffer to the core due to memory back pressure", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_2_CORE_BP_SB_MEMORY}, + {"TPC_SB_2_CORE_BP_SB_LD_TNSR_FIFO_FULL", "Internal back pressure from VPU to TPC SB due to response fifo full", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_2_CORE_BP_SB_LD_TNSR_FIFO_FULL}, + {"TPC_WB2CORE_BP", "Back pressure from memory to TPC write queue on store tensors", GAUDI2_ENGINE_TPC, TPC_SPMU_WB2CORE_BP}, + + /* TPC stalls (IDs 9-21) */ + {"TPC_STALL_ON_ICACHE_MISS", "Scalar pipe stall due to instruction cache miss", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_ICACHE_MISS}, + {"TPC_STALL_ON_DCACHE_MISS", "Scalar pipe stall due to data cache miss", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_DCACHE_MISS}, + {"TPC_STALL_ON_POP_FROM_SB", "VPU stall on ld_tnsr", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_POP_FROM_SB}, + {"TPC_STALL_ON_LOOKUP_CACHE_MISS", "Vector pipe stall due to lookup tables miss", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_LOOKUP_CACHE_MISS}, + {"TPC_STALL_ON_IRQ_FULL", "Scalar pipe stall due to instruction queue full", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_IRQ_FULL}, + {"TPC_STALL_ON_MAX_COLORS", "Vector pipe stall due to more than 4 kernels/ASOs running", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_MAX_COLORS}, + 
{"TPC_STALL_ON_UARCH_BUBBLE", "Vector pipe stall due to instruction bubble during kernel", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_UARCH_BUBBLE}, + {"TPC_STALL_VPU", "General vector pipe stall indication", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_VPU}, + {"TPC_STALL_SPU_ANY", "General scalar pipe stall indication", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_SPU_ANY}, + {"TPC_STALL_ON_TSB_FULL", "Scalar pipe stall due to suspension buffer request buffer full", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_TSB_FULL}, + {"TPC_STALL_ON_ST_L_EXT", "Scalar pipe stall due to config write to external agent", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_ST_L_EXT}, + {"TPC_STALL_ON_LD_L_EXT", "Scalar pipe stall due to config read from external agent", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_LD_L_EXT}, + {"TPC_STALL", "TPC stall", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL}, + + /* TPC opcode execution (IDs 22-25) */ + {"TPC_NUM_OF_OPCODE1_EXECUTED", "How many times a pre-configured instruction opcode1 was executed", GAUDI2_ENGINE_TPC, TPC_SPMU_NUM_OF_OPCODE1_EXECUTED}, + {"TPC_NUM_OF_OPCODE2_EXECUTED", "How many times a pre-configured instruction opcode2 was executed", GAUDI2_ENGINE_TPC, TPC_SPMU_NUM_OF_OPCODE2_EXECUTED}, + {"TPC_NUM_OF_OPCODE3_EXECUTED", "How many times a pre-configured instruction opcode3 was executed", GAUDI2_ENGINE_TPC, TPC_SPMU_NUM_OF_OPCODE3_EXECUTED}, + {"TPC_NUM_OF_OPCODE4_EXECUTED", "How many times a pre-configured instruction opcode4 was executed", GAUDI2_ENGINE_TPC, TPC_SPMU_NUM_OF_OPCODE4_EXECUTED}, + + /* TPC execution (IDs 26-28) */ + {"TPC_KERNEL_EXECUTED", "End of kernel on the vector pipe indication", GAUDI2_ENGINE_TPC, TPC_SPMU_KERNEL_EXECUTED}, + {"TPC_SCALAR_PIPE_EXEC", "Active status of scalar pipe (not including stalls)", GAUDI2_ENGINE_TPC, TPC_SPMU_SCALAR_PIPE_EXEC}, + {"TPC_VECTOR_PIPE_EXEC", "Active status of vector pipe (not including stalls)", GAUDI2_ENGINE_TPC, TPC_SPMU_VECTOR_PIPE_EXEC}, + + /* TPC cache (IDs 29-34) */ + {"TPC_ICACHE_MISS", "Miss on requested instruction 
fetch", GAUDI2_ENGINE_TPC, TPC_SPMU_ICACHE_MISS}, + {"TPC_ICACHE_HIT", "Hit on requested instruction fetch", GAUDI2_ENGINE_TPC, TPC_SPMU_ICACHE_HIT}, + {"TPC_KILLED_INSTRUCTION", "Scalar pipe indication for jumping over instructions not executed", GAUDI2_ENGINE_TPC, TPC_SPMU_KILLED_INSTRUCTION}, + {"TPC_LUT_MISS", "Miss on lookup table read instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_LUT_MISS}, + {"TPC_DCACHE_MISS", "Miss on requested data fetch from cache", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_MISS}, + {"TPC_DCACHE_HIT", "Hit on requested data fetch from cache", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_HIT}, + + /* TPC out of bounds (IDs 35-39) */ + {"TPC_OUT_OF_BOUND_DIM0", "Load tensor was out of the tensor bound on dimension0", GAUDI2_ENGINE_TPC, TPC_SPMU_OUT_OF_BOUND_DIM0}, + {"TPC_OUT_OF_BOUND_DIM1", "Load tensor was out of the tensor bound on dimension1", GAUDI2_ENGINE_TPC, TPC_SPMU_OUT_OF_BOUND_DIM1}, + {"TPC_OUT_OF_BOUND_DIM2", "Load tensor was out of the tensor bound on dimension2", GAUDI2_ENGINE_TPC, TPC_SPMU_OUT_OF_BOUND_DIM2}, + {"TPC_OUT_OF_BOUND_DIM3", "Load tensor was out of the tensor bound on dimension3", GAUDI2_ENGINE_TPC, TPC_SPMU_OUT_OF_BOUND_DIM3}, + {"TPC_OUT_OF_BOUND_DIM4", "Load tensor was out of the tensor bound on dimension4", GAUDI2_ENGINE_TPC, TPC_SPMU_OUT_OF_BOUND_DIM4}, + + /* TPC arithmetic exceptions (IDs 40-56) */ + {"TPC_DIV_BY_0", "Divider zero on Div instruction on the scalar pipe", GAUDI2_ENGINE_TPC, TPC_SPMU_DIV_BY_0}, + {"TPC_SPU_MAC_OVERFLOW", "Scalar pipe overflow/underflow indication on MAC instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_MAC_OVERFLOW}, + {"TPC_SPU_ADDSUB_OVERFLOW", "Scalar pipe overflow/underflow indication on ADD/SUB instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_ADDSUB_OVERFLOW}, + {"TPC_SPU_ABS_OVERFLOW", "Scalar pipe overflow indication on Absolute instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_ABS_OVERFLOW}, + {"TPC_SPU_FMA_FP_DST_NAN", "Scalar pipe floating point result is Not-a-Number", GAUDI2_ENGINE_TPC, 
TPC_SPMU_SPU_FMA_FP_DST_NAN}, + {"TPC_SPU_FMA_FP_DST_INF", "Scalar pipe floating point result is infinity", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_FMA_FP_DST_INF}, + {"TPC_SPU_CONVERT_FP_DST_NAN", "Scalar pipe convert result is Not-a-Number", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_CONVERT_FP_DST_NAN}, + {"TPC_SPU_CONVERT_FP_DST_INF", "Scalar pipe convert result is infinity", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_CONVERT_FP_DST_INF}, + {"TPC_SPU_FP_DST_DENORM", "Scalar pipe floating point result is a denormalized number", GAUDI2_ENGINE_TPC, TPC_SPMU_SPU_FP_DST_DENORM}, + {"TPC_VPU_MAC_OVERFLOW", "VPU overflow/underflow indication on MAC instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_MAC_OVERFLOW}, + {"TPC_VPU_ADDSUB_OVERFLOW", "VPU overflow/underflow indication on ADD/SUB instruction", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_ADDSUB_OVERFLOW}, + {"TPC_VPU_ABS_OVERFLOW", "VPU floating point result is Not-a-Number or infinite", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_ABS_OVERFLOW}, + {"TPC_VPU_CONVERT_FP_DST_NAN", "VPU convert floating point result is Not-a-Number", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_CONVERT_FP_DST_NAN}, + {"TPC_VPU_CONVERT_FP_DST_INF", "VPU convert floating point result is infinite", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_CONVERT_FP_DST_INF}, + {"TPC_VPU_FMA_FP_DST_NAN", "VPU FMA result is Not-a-Number", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_FMA_FP_DST_NAN}, + {"TPC_VPU_FMA_FP_DST_INF", "VPU FMA result is infinite", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_FMA_FP_DST_INF}, + {"TPC_VPU_FP_DST_DENORM", "VPU floating point result is a denormalized number", GAUDI2_ENGINE_TPC, TPC_SPMU_VPU_FP_DST_DENORM}, + + /* TPC additional events (IDs 57-66) */ + {"TPC_STALL_ON_ST_TSNR_FULL", "Stall due to store tensor full", GAUDI2_ENGINE_TPC, TPC_SPMU_STALL_ON_ST_TSNR_FULL}, + {"TPC_LUT_HIT", "VPE/SPE lookup table hit", GAUDI2_ENGINE_TPC, TPC_SPMU_LUT_HIT}, + {"TPC_ADDRESS_EXCEED_VLM", "Address exceeded the local memory", GAUDI2_ENGINE_TPC, TPC_SPMU_ADDRESS_EXCEED_VLM}, + {"TPC_LD_LOCK_RESEND", "Exclusive access - access to an 
already locked region", GAUDI2_ENGINE_TPC, TPC_SPMU_LD_LOCK_RESEND}, + {"TPC_LD_L_PROT_VIO", "Exclusive access - load lock protection violation", GAUDI2_ENGINE_TPC, TPC_SPMU_LD_L_PROT_VIO}, + {"TPC_ST_L_PROT_VIO", "Exclusive access - store unlock protection violation", GAUDI2_ENGINE_TPC, TPC_SPMU_ST_L_PROT_VIO}, + {"TPC_DCACHE_L0CD_MISMATCH", "D$ L0CD mismatch on unaligned access", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_L0CD_MISMATCH}, + {"TPC_STALL_ON_LD_L_INT", "Stall due to ld_l from TPC internal address space", GAUDI2_ENGINE_TPC, TPC_SPMU_TPC_STALL_ON_LD_L_INT}, + {"TPC_SB_FIRST_RESPONSE", "SB first response received", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_FIRST_RESPONSE}, + {"TPC_SB_LAST_RESPONSE", "SB last response received", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_LAST_RESPONSE}, + + /* TPC SB occupancy (IDs 67-70) */ + {"TPC_SB_OCCUPANCY0", "SB occupancy: full > occupancy > 3/4", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_OCCUPANCY0}, + {"TPC_SB_OCCUPANCY1", "SB occupancy: 3/4 > occupancy > 1/2", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_OCCUPANCY1}, + {"TPC_SB_OCCUPANCY2", "SB occupancy: 1/2 > occupancy > 1/4", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_OCCUPANCY2}, + {"TPC_SB_OCCUPANCY3", "SB occupancy: 1/4 > occupancy > empty", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_OCCUPANCY3}, + + /* TPC SB CAM (IDs 71-76) */ + {"TPC_SB_DBG_CAM0_MISS", "SB debug CAM0 miss", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM0_MISS}, + {"TPC_SB_DBG_CAM0_HIT", "SB debug CAM0 hit", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM0_HIT}, + {"TPC_SB_DBG_CAM0_UNCACHEABLE", "SB debug CAM0 uncacheable", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM0_UNCACHEABLE}, + {"TPC_SB_DBG_CAM1_MISS", "SB debug CAM1 miss", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM1_MISS}, + {"TPC_SB_DBG_CAM1_HIT", "SB debug CAM1 hit", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM1_HIT}, + {"TPC_SB_DBG_CAM1_UNCACHEABLE", "SB debug CAM1 uncacheable", GAUDI2_ENGINE_TPC, TPC_SPMU_SB_DBG_CAM1_UNCACHEABLE}, + + /* TPC additional cache (IDs 77-80) */ + {"TPC_NOC_2_SB_BP", "Back pressure from NOC to TPC suspension 
buffer on read requests", GAUDI2_ENGINE_TPC, TPC_SPMU_NOC_2_SB_BP}, + {"TPC_DCACHE_HW_PREF", "Hardware prefetch execution event", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_HW_PREF}, + {"TPC_DCACHE_UC", "Uncached execution event", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_UC}, + {"TPC_DCACHE_DEALIGN", "Unaligned execution event", GAUDI2_ENGINE_TPC, TPC_SPMU_DCACHE_DEALIGN}, + + /* ===== EDMA SPMU events (50 events) ===== */ + + /* EDMA QMAN (IDs 0-9) */ + {"EDMA_QMAN0_PQ_BUF_PEND", "PQ (upper level) buffer QMAN0 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN1_PQ_BUF_PEND", "PQ (upper level) buffer QMAN1 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN2_PQ_BUF_PEND", "PQ (upper level) buffer QMAN2 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN3_PQ_BUF_PEND", "PQ (upper level) buffer QMAN3 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN0_CQ_BUF_PEND", "CQ (completion) buffer QMAN0 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN1_CQ_BUF_PEND", "CQ (completion) buffer QMAN1 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN2_CQ_BUF_PEND", "CQ (completion) buffer QMAN2 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN3_CQ_BUF_PEND", "CQ (completion) buffer QMAN3 is empty", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN_CMDQ_CQ_BUF_PEND", "Command queue CQ buffer pending", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN}, + {"EDMA_QMAN_CMDQ_ARC_CQ_BUF_PEND", "ARC command queue CQ buffer pending", GAUDI2_ENGINE_EDMA, EDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN}, + + /* EDMA errors and trace (IDs 10-17) */ + {"EDMA_AXI_HBW_ERR", "AXI high bandwidth error", GAUDI2_ENGINE_EDMA, EDMA_SPMU_AXI_HBW_ERR}, + {"EDMA_AXI_LBW_ERR", "AXI low bandwidth error", GAUDI2_ENGINE_EDMA, EDMA_SPMU_AXI_LBW_ERR}, + {"EDMA_TRACE_FENCE_START", 
"EDMA fence start", GAUDI2_ENGINE_EDMA, EDMA_SPMU_TRACE_FENCE_START}, + {"EDMA_TRACE_FENCE_DONE", "EDMA fence done", GAUDI2_ENGINE_EDMA, EDMA_SPMU_TRACE_FENCE_DONE}, + {"EDMA_TRACE_CP_SW_STOP", "CP software stop trace event", GAUDI2_ENGINE_EDMA, EDMA_SPMU_TRACE_CP_SW_STOP}, + {"EDMA_CP_ERR", "Command processor error", GAUDI2_ENGINE_EDMA, EDMA_SPMU_CP_ERR}, + {"EDMA_ARB_ERR", "Arbiter error", GAUDI2_ENGINE_EDMA, EDMA_SPMU_ARB_ERR}, + {"EDMA_DESC_PUSH", "EDMA descriptor push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_TRACE_CHOICE_WIN_PUSH}, + + /* EDMA DMA trace (IDs 18-27) */ + {"EDMA_DMA_TRC_DESC_PUSH", "DMA trace descriptor push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_DESC_PUSH}, + {"EDMA_DMA_TRC_CPL_MSG_SENT", "DMA trace completion message sent", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT}, + {"EDMA_DMA_TRC_RD_FRST_ADDR_PUSH", "DMA trace read first address push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH}, + {"EDMA_DMA_TRC_RD_LAST_ADDR_PUSH", "DMA trace read last address push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH}, + {"EDMA_DMA_TRC_WR_FRST_ADDR_PUSH", "DMA trace write first address push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH}, + {"EDMA_DMA_TRC_WR_LAST_ADDR_PUSH", "DMA trace write last address push", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH}, + {"EDMA_DMA_TRC_RD_DATA_FRST", "DMA trace read data first", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST}, + {"EDMA_DMA_TRC_RD_DATA_LAST", "DMA trace read data last", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST}, + {"EDMA_DMA_TRC_WR_DATA_FRST", "DMA trace write data first", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST}, + {"EDMA_DMA_TRC_WR_DATA_LAST", "DMA trace write data last", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST}, + + /* EDMA backpressure and context (IDs 28-36) */ + {"EDMA_MESH2SB_BP", "Mesh to SB back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP}, + 
{"EDMA_SB2MESH_BP", "SB to mesh back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP}, + {"EDMA_MESH2WB_BP", "Mesh to WB back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP}, + {"EDMA_RD_CTX_END2START", "Read context end to start", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START}, + {"EDMA_WR_CTX_END2START", "Write context end to start", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START}, + {"EDMA_SB2AGU_BP", "SB to AGU back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP}, + {"EDMA_SB_FULL_BP", "SB full back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP}, + {"EDMA_WB2AGU_BP", "WB to AGU back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP}, + {"EDMA_WB2GSKT_BP", "WB to gasket back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP}, + + /* EDMA SB monitor counters (IDs 37-40) */ + {"EDMA_SB_MON_CNT_0", "SB monitor counter 0", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0}, + {"EDMA_SB_MON_CNT_1", "SB monitor counter 1", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1}, + {"EDMA_SB_MON_CNT_2", "SB monitor counter 2", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2}, + {"EDMA_SB_MON_CNT_3", "SB monitor counter 3", GAUDI2_ENGINE_EDMA, EDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3}, + + /* EDMA SB and CAM (IDs 41-49) */ + {"EDMA_SB_2_INITIATOR_BP_SB_FULL", "SB to initiator back pressure due to SB full", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL}, + {"EDMA_SB_2_INITIATOR_BP", "SB to initiator back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_2_INITIATOR_BP}, + {"EDMA_SB_DBG_CAM0_MISS", "SB debug CAM0 miss", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM0_MISS}, + {"EDMA_SB_DBG_CAM0_HIT", "SB debug CAM0 hit", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM0_HIT}, + {"EDMA_SB_DBG_CAM0_UNCACHEABLE", "SB debug CAM0 uncacheable", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE}, + {"EDMA_SB_DBG_CAM1_MISS", "SB debug CAM1 miss", 
GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM1_MISS}, + {"EDMA_SB_DBG_CAM1_HIT", "SB debug CAM1 hit", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM1_HIT}, + {"EDMA_SB_DBG_CAM1_UNCACHEABLE", "SB debug CAM1 uncacheable", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE}, + {"EDMA_SB_AXI_NOC_2_SB_BP", "AXI NOC to SB back pressure", GAUDI2_ENGINE_EDMA, EDMA_SPMU_SB_AXI_NOC_2_SB_BP}, + + /* ===== MME CTRL SPMU events (8 events) ===== */ + {"MME_CONV_END_STALL_DIAG", "Convolution end stall on diagonal", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_CONV_END_STALL_DIAG}, + {"MME_CONV_END_STALL_ACC", "Convolution end stall on accumulator", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_CONV_END_STALL_ACC}, + {"MME_CONV_END_STALL_DIAG_STALL_ACC", "Convolution end stall on both diagonal and accumulator", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_CONV_END_STALL_DIAG_STALL_ACC}, + {"MME_STALL_ON_B", "MME outer product stall waiting for B matrix", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_B}, + {"MME_STALL_ON_A", "MME outer product stall waiting for A matrix", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_OUTER_PRODUCT_STALL_ON_A}, + {"MME_NUM_OUTER_PRODUCTS", "MME number of outer products", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_NUM_OUTER_PRODUCTS}, + {"MME_QM_PREFETCH_BUFFER_EMPTY", "MME QM prefetch buffer empty", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_QM_PREFETCH_BUFFER_EMPTY}, + {"MME_PQ_NOT_EMPTY_BUT_CQ_EMPTY", "MME PQ not empty but CQ empty", GAUDI2_ENGINE_MME, MME_CTRL_SPMU_PQ_NOT_EMPTY_BUT_CQ_EMPTY}, + + /* ===== PDMA SPMU events (50 events) ===== */ + + /* PDMA QMAN (IDs 0-9) */ + {"PDMA_QMAN0_PQ_BUF_PEND", "PQ (upper level) buffer QMAN0 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN0_PQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN1_PQ_BUF_PEND", "PQ (upper level) buffer QMAN1 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN1_PQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN2_PQ_BUF_PEND", "PQ (upper level) buffer QMAN2 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN2_PQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN3_PQ_BUF_PEND", "PQ (upper level) buffer QMAN3 is 
empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN3_PQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN0_CQ_BUF_PEND", "CQ (completion) buffer QMAN0 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN0_CQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN1_CQ_BUF_PEND", "CQ (completion) buffer QMAN1 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN1_CQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN2_CQ_BUF_PEND", "CQ (completion) buffer QMAN2 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN2_CQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN3_CQ_BUF_PEND", "CQ (completion) buffer QMAN3 is empty", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN3_CQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN_CMDQ_CQ_BUF_PEND", "Command queue CQ buffer pending", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN_CMDQ_CQ_BUF_PEND_CNT_EN}, + {"PDMA_QMAN_CMDQ_ARC_CQ_BUF_PEND", "ARC command queue CQ buffer pending", GAUDI2_ENGINE_PDMA, PDMA_SPMU_QMAN_CMDQ_ARC_CQ_BUF_PEND_CNT_EN}, + + /* PDMA errors and trace (IDs 10-17) */ + {"PDMA_AXI_HBW_ERR", "AXI high bandwidth error", GAUDI2_ENGINE_PDMA, PDMA_SPMU_AXI_HBW_ERR}, + {"PDMA_AXI_LBW_ERR", "AXI low bandwidth error", GAUDI2_ENGINE_PDMA, PDMA_SPMU_AXI_LBW_ERR}, + {"PDMA_TRACE_FENCE_START", "PDMA fence start", GAUDI2_ENGINE_PDMA, PDMA_SPMU_TRACE_FENCE_START}, + {"PDMA_TRACE_FENCE_DONE", "PDMA fence done", GAUDI2_ENGINE_PDMA, PDMA_SPMU_TRACE_FENCE_DONE}, + {"PDMA_TRACE_CP_SW_STOP", "CP software stop trace event", GAUDI2_ENGINE_PDMA, PDMA_SPMU_TRACE_CP_SW_STOP}, + {"PDMA_CP_ERR", "Command processor error", GAUDI2_ENGINE_PDMA, PDMA_SPMU_CP_ERR}, + {"PDMA_ARB_ERR", "Arbiter error", GAUDI2_ENGINE_PDMA, PDMA_SPMU_ARB_ERR}, + {"PDMA_DESC_PUSH", "PDMA descriptor push", GAUDI2_ENGINE_PDMA, PDMA_SPMU_TRACE_CHOICE_WIN_PUSH}, + + /* PDMA DMA trace (IDs 18-27) */ + {"PDMA_DMA_TRC_DESC_PUSH", "DMA trace descriptor push", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_DESC_PUSH}, + {"PDMA_DMA_TRC_CPL_MSG_SENT", "DMA trace completion message sent", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_CPL_MSG_SENT}, + {"PDMA_DMA_TRC_RD_FRST_ADDR_PUSH", "DMA trace read first address push", 
GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_RD_FRST_ADDR_PUSH}, + {"PDMA_DMA_TRC_RD_LAST_ADDR_PUSH", "DMA trace read last address push", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_RD_LAST_ADDR_PUSH}, + {"PDMA_DMA_TRC_WR_FRST_ADDR_PUSH", "DMA trace write first address push", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_WR_FRST_ADDR_PUSH}, + {"PDMA_DMA_TRC_WR_LAST_ADDR_PUSH", "DMA trace write last address push", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_WR_LAST_ADDR_PUSH}, + {"PDMA_DMA_TRC_RD_DATA_FRST", "DMA trace read data first", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_RD_DATA_FRST}, + {"PDMA_DMA_TRC_RD_DATA_LAST", "DMA trace read data last", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_RD_DATA_LAST}, + {"PDMA_DMA_TRC_WR_DATA_FRST", "DMA trace write data first", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_WR_DATA_FRST}, + {"PDMA_DMA_TRC_WR_DATA_LAST", "DMA trace write data last", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_TRC_WR_DATA_LAST}, + + /* PDMA backpressure and context (IDs 28-36) */ + {"PDMA_MESH2SB_BP", "Mesh to SB back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_MESH2SB_BP}, + {"PDMA_SB2MESH_BP", "SB to mesh back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB2MESH_BP}, + {"PDMA_MESH2WB_BP", "Mesh to WB back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_MESH2WB_BP}, + {"PDMA_RD_CTX_END2START", "Read context end to start", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_RD_CTX_END2START}, + {"PDMA_WR_CTX_END2START", "Write context end to start", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_WR_CTX_END2START}, + {"PDMA_SB2AGU_BP", "SB to AGU back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB2AGU_BP}, + {"PDMA_SB_FULL_BP", "SB full back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB_FULL_BP}, + {"PDMA_WB2AGU_BP", "WB to AGU back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_WB2AGU_BP}, + {"PDMA_WB2GSKT_BP", "WB to gasket back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_WB2GSKT_BP}, + + /* PDMA SB monitor counters 
(IDs 37-40) */ + {"PDMA_SB_MON_CNT_0", "SB monitor counter 0", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_0}, + {"PDMA_SB_MON_CNT_1", "SB monitor counter 1", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_1}, + {"PDMA_SB_MON_CNT_2", "SB monitor counter 2", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_2}, + {"PDMA_SB_MON_CNT_3", "SB monitor counter 3", GAUDI2_ENGINE_PDMA, PDMA_SPMU_DBG_DMA_SPMU_SB_MON_CNT_3}, + + /* PDMA SB and CAM (IDs 41-49) */ + {"PDMA_SB_2_INITIATOR_BP_SB_FULL", "SB to initiator back pressure due to SB full", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_2_INITIATOR_BP_SB_FULL}, + {"PDMA_SB_2_INITIATOR_BP", "SB to initiator back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_2_INITIATOR_BP}, + {"PDMA_SB_DBG_CAM0_MISS", "SB debug CAM0 miss", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM0_MISS}, + {"PDMA_SB_DBG_CAM0_HIT", "SB debug CAM0 hit", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM0_HIT}, + {"PDMA_SB_DBG_CAM0_UNCACHEABLE", "SB debug CAM0 uncacheable", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM0_UNCACHEABLE}, + {"PDMA_SB_DBG_CAM1_MISS", "SB debug CAM1 miss", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM1_MISS}, + {"PDMA_SB_DBG_CAM1_HIT", "SB debug CAM1 hit", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM1_HIT}, + {"PDMA_SB_DBG_CAM1_UNCACHEABLE", "SB debug CAM1 uncacheable", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_DBG_CAM1_UNCACHEABLE}, + {"PDMA_SB_AXI_NOC_2_SB_BP", "AXI NOC to SB back pressure", GAUDI2_ENGINE_PDMA, PDMA_SPMU_SB_AXI_NOC_2_SB_BP}, + + {NULL, NULL, 0, 0} +}; + +/* Number of base events in catalog (computed at init) */ +static int gaudi2_num_catalog_events = 0; + +/* Per-event tracking for an eventset */ +typedef struct { + unsigned int event_code; /* Encoded event code (device + index) */ + int device_idx; /* Device index */ + int catalog_idx; /* Catalog event index */ + unsigned int counter_idx; /* Counter slot (0-5 per SPMU) */ + uint64_t spmu_base; /* SPMU base address */ + long long last_value; + long long accumulated; +} gaudi2_counter_t; + 
+/* Per-device tracking within an eventset */ +typedef struct { + int device_idx; + int num_events; /* Events for this device */ + int event_indices[GAUDI2_MAX_COUNTERS]; /* Indices into counters[] */ + int debug_mode_enabled; + int spmu_enabled; +} gaudi2_device_ctl_t; + +/* Per-eventset state */ +typedef struct { + gaudi2_counter_t counters[GAUDI2_MAX_COUNTERS]; + int num_counters; + long long values[GAUDI2_MAX_COUNTERS]; + int running; + /* Per-device control within this eventset */ + gaudi2_device_ctl_t device_ctl[GAUDI2_MAX_DEVICES]; + uint32_t active_device_mask; /* Bitmap of devices with events */ + int num_active_devices; +} gaudi2_control_t; + +/* Per-thread context - tracks debug mode per device */ +typedef struct { + int debug_mode_enabled[GAUDI2_MAX_DEVICES]; +} gaudi2_context_t; +static unsigned int gaudi2_lock; + +papi_vector_t _gaudi2_vector; + +/* Load hlthunk library */ +static int load_hlthunk_library(void) +{ + char root_lib_path[PAPI_HUGE_STR_LEN]; + const char *gaudi2_root; + int strLen; + + gaudi2_root = getenv("PAPI_GAUDI2_ROOT"); + + if (gaudi2_root != NULL) { + strLen = snprintf(root_lib_path, sizeof(root_lib_path), + "%s/lib/habanalabs/libhl-thunk.so", gaudi2_root); + if (strLen > 0 && strLen < (int)sizeof(root_lib_path)) { + hlthunk_handle = dlopen(root_lib_path, RTLD_NOW | RTLD_GLOBAL); + if (hlthunk_handle) { + SUBDBG("Loaded libhl-thunk.so from PAPI_GAUDI2_ROOT: %s\n", root_lib_path); + } + } + } + + /* Fallback */ + if (!hlthunk_handle) { + const char *fallback_paths[] = { + "/usr/lib/habanalabs/libhl-thunk.so", + "libhl-thunk.so", + NULL + }; + + for (int i = 0; fallback_paths[i] != NULL; i++) { + hlthunk_handle = dlopen(fallback_paths[i], RTLD_NOW | RTLD_GLOBAL); + if (hlthunk_handle) { + SUBDBG("Loaded libhl-thunk.so from fallback: %s\n", fallback_paths[i]); + break; + } + } + } + + if (!hlthunk_handle) { + SUBDBG("Failed to load libhl-thunk.so: %s\n", dlerror()); + return PAPI_ENOSUPP; + } + + p_hlthunk_open = 
(hlthunk_open_fn)dlsym(hlthunk_handle, "hlthunk_open"); + p_hlthunk_close = (hlthunk_close_fn)dlsym(hlthunk_handle, "hlthunk_close"); + p_hlthunk_debug = (hlthunk_debug_fn)dlsym(hlthunk_handle, "hlthunk_debug"); + p_hlthunk_get_device_name_from_fd = (hlthunk_get_device_name_from_fd_fn) + dlsym(hlthunk_handle, "hlthunk_get_device_name_from_fd"); + p_hlthunk_get_hw_ip_info = (hlthunk_get_hw_ip_info_fn) + dlsym(hlthunk_handle, "hlthunk_get_hw_ip_info"); + p_hlthunk_get_device_count = (hlthunk_get_device_count_fn) + dlsym(hlthunk_handle, "hlthunk_get_device_count"); + + if (!p_hlthunk_open || !p_hlthunk_close || !p_hlthunk_debug || + !p_hlthunk_get_device_name_from_fd || !p_hlthunk_get_hw_ip_info || + !p_hlthunk_get_device_count) { + SUBDBG("Failed to find required hlthunk symbols\n"); + dlclose(hlthunk_handle); + hlthunk_handle = NULL; + return PAPI_ENOSUPP; + } + + return PAPI_OK; +} + +/* Open a device by minor number */ +static int open_device_by_minor(int minor, int node_type) +{ + char path[64]; + const char *fmt; + int strLen; + + if (node_type == HLTHUNK_NODE_PRIMARY) + fmt = "/dev/accel/accel%d"; + else + fmt = "/dev/accel/accel_controlD%d"; + + strLen = snprintf(path, sizeof(path), fmt, minor); + if (strLen < 0 || strLen >= (int)sizeof(path)) + return -1; + return open(path, O_RDWR | O_CLOEXEC, 0); +} + +/* Check if a device type is a Gaudi2 variant */ +static int is_gaudi2_device(int device_type) +{ + return (device_type == HLTHUNK_DEVICE_GAUDI2 || + device_type == HLTHUNK_DEVICE_GAUDI2B || + device_type == HLTHUNK_DEVICE_GAUDI2C || + device_type == HLTHUNK_DEVICE_GAUDI2D); +} + +/** + * Find an existing fd for a specific device minor number from /proc/self/fd. + * This allows reusing the runtime's (e.g. PyTorch's) device context so that + * SPMU counters see the workload activity on that context. + * Returns fd number if found, or -1 if no existing fd matches. 
+ */ +static int find_existing_fd_for_minor(int minor) +{ + DIR *dir; + struct dirent *entry; + char link_path[PAPI_MIN_STR_LEN]; + char target[PAPI_HUGE_STR_LEN]; + char expected[64]; + ssize_t len; + int found_fd = -1; + int strLen; + int status; + + strLen = snprintf(expected, sizeof(expected), "/dev/accel/accel%d", minor); + if (strLen < 0 || strLen >= (int)sizeof(expected)) + return -1; + + dir = opendir("/proc/self/fd"); + if (!dir) { + SUBDBG("Failed to open /proc/self/fd: %s\n", strerror(errno)); + return -1; + } + + while ((entry = readdir(dir)) != NULL) { + if (entry->d_name[0] == '.') + continue; + + strLen = snprintf(link_path, sizeof(link_path), "/proc/self/fd/%s", entry->d_name); + if (strLen < 0 || strLen >= (int)sizeof(link_path)) { + SUBDBG("snprintf overflow for /proc/self/fd/%s\n", entry->d_name); + continue; + } + + len = readlink(link_path, target, sizeof(target) - 1); + if (len < 0) + continue; + target[len] = '\0'; + + if (strcmp(target, expected) == 0) { + found_fd = atoi(entry->d_name); + SUBDBG("Found existing fd for minor %d: fd=%d -> %s\n", + minor, found_fd, target); + break; + } + } + + status = closedir(dir); + if (status == -1) { + SUBDBG("closedir failed for /proc/self/fd: %s\n", strerror(errno)); + } + + return found_fd; +} + +static int enable_debug_mode(int fd) +{ + struct hl_debug_args debug; + + memset(&debug, 0, sizeof(debug)); + debug.op = HL_DEBUG_OP_SET_MODE; + debug.enable = 1; + + if (p_hlthunk_debug(fd, &debug) < 0) { + SUBDBG("Failed to enable debug mode on fd=%d\n", fd); + return PAPI_ESYS; + } + + return PAPI_OK; +} + +static int disable_debug_mode(int fd) +{ + struct hl_debug_args debug; + + memset(&debug, 0, sizeof(debug)); + debug.op = HL_DEBUG_OP_SET_MODE; + debug.enable = 0; + + p_hlthunk_debug(fd, &debug); + return PAPI_OK; +} + +static int enable_spmu(int fd, int reg_idx, uint64_t *events, int num_events) +{ + struct hl_debug_params_spmu params; + struct hl_debug_args debug; + + memset(¶ms, 0, sizeof(params)); 
+ for (int i = 0; i < num_events && i < HL_DEBUG_MAX_AUX_VALUES; i++) { + params.event_types[i] = events[i]; + } + params.event_types_num = num_events; + + memset(&debug, 0, sizeof(debug)); + debug.op = HL_DEBUG_OP_SPMU; + debug.reg_idx = reg_idx; + debug.enable = 1; + debug.input_ptr = (uint64_t)¶ms; + debug.input_size = sizeof(params); + + if (p_hlthunk_debug(fd, &debug) < 0) { + SUBDBG("Failed to enable SPMU on fd=%d reg_idx=%d\n", fd, reg_idx); + return PAPI_ESYS; + } + + return PAPI_OK; +} + +static int disable_spmu(int fd, int reg_idx) +{ + struct hl_debug_args debug; + + memset(&debug, 0, sizeof(debug)); + debug.op = HL_DEBUG_OP_SPMU; + debug.reg_idx = reg_idx; + debug.enable = 0; + + p_hlthunk_debug(fd, &debug); + return PAPI_OK; +} + +/* Read SPMU counters via READBLOCK */ +static int read_spmu_counters(int fd, uint64_t base_addr, int num_counters, long long *values) +{ + struct hl_debug_params_read_block params; + struct hl_debug_args debug; + void *read_buffer; + int papi_errno = PAPI_OK; + int i; + + for (i = 0; i < num_counters; i++) + values[i] = 0; + + read_buffer = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (read_buffer == MAP_FAILED) { + SUBDBG("mmap failed for SPMU read buffer\n"); + papi_errno = PAPI_ENOMEM; + goto cleanup; + } + + memset(read_buffer, 0, 4096); + + memset(¶ms, 0, sizeof(params)); + params.cfg_address = base_addr; + params.user_address = (uint64_t)read_buffer; + params.size = 256; + params.flags = 0; + + memset(&debug, 0, sizeof(debug)); + debug.op = HL_DEBUG_OP_READBLOCK; + debug.input_ptr = (uint64_t)¶ms; + debug.input_size = sizeof(params); + + if (p_hlthunk_debug(fd, &debug) < 0) { + SUBDBG("READBLOCK failed for base_addr=0x%llx\n", (unsigned long long)base_addr); + papi_errno = PAPI_ESYS; + goto cleanup; + } + + /* Extract lower 32 bits of each 64-bit counter */ + uint32_t *counter_data = (uint32_t *)read_buffer; + for (i = 0; i < num_counters; i++) + values[i] = (long 
long)counter_data[i * 2]; + +cleanup: + if (read_buffer != MAP_FAILED) + munmap(read_buffer, 4096); + return papi_errno; +} + +static uint64_t get_spmu_base_address(gaudi2_engine_type_t engine, int dcore, int instance) +{ + static const uint64_t tpc_spmu_bases[GAUDI2_NUM_DCORES][GAUDI2_TPC_PER_DCORE] = { + {GAUDI2_DCORE0_TPC0_SPMU_BASE, GAUDI2_DCORE0_TPC1_SPMU_BASE, + GAUDI2_DCORE0_TPC2_SPMU_BASE, GAUDI2_DCORE0_TPC3_SPMU_BASE, + GAUDI2_DCORE0_TPC4_SPMU_BASE, GAUDI2_DCORE0_TPC5_SPMU_BASE}, + {GAUDI2_DCORE1_TPC0_SPMU_BASE, GAUDI2_DCORE1_TPC1_SPMU_BASE, + GAUDI2_DCORE1_TPC2_SPMU_BASE, GAUDI2_DCORE1_TPC3_SPMU_BASE, + GAUDI2_DCORE1_TPC4_SPMU_BASE, GAUDI2_DCORE1_TPC5_SPMU_BASE}, + {GAUDI2_DCORE2_TPC0_SPMU_BASE, GAUDI2_DCORE2_TPC1_SPMU_BASE, + GAUDI2_DCORE2_TPC2_SPMU_BASE, GAUDI2_DCORE2_TPC3_SPMU_BASE, + GAUDI2_DCORE2_TPC4_SPMU_BASE, GAUDI2_DCORE2_TPC5_SPMU_BASE}, + {GAUDI2_DCORE3_TPC0_SPMU_BASE, GAUDI2_DCORE3_TPC1_SPMU_BASE, + GAUDI2_DCORE3_TPC2_SPMU_BASE, GAUDI2_DCORE3_TPC3_SPMU_BASE, + GAUDI2_DCORE3_TPC4_SPMU_BASE, GAUDI2_DCORE3_TPC5_SPMU_BASE} + }; + + static const uint64_t edma_spmu_bases[GAUDI2_NUM_DCORES][GAUDI2_EDMA_PER_DCORE] = { + {GAUDI2_DCORE0_EDMA0_SPMU_BASE, GAUDI2_DCORE0_EDMA1_SPMU_BASE}, + {GAUDI2_DCORE1_EDMA0_SPMU_BASE, GAUDI2_DCORE1_EDMA1_SPMU_BASE}, + {GAUDI2_DCORE2_EDMA0_SPMU_BASE, GAUDI2_DCORE2_EDMA1_SPMU_BASE}, + {GAUDI2_DCORE3_EDMA0_SPMU_BASE, GAUDI2_DCORE3_EDMA1_SPMU_BASE} + }; + + switch (engine) { + case GAUDI2_ENGINE_TPC: + if (dcore < GAUDI2_NUM_DCORES && instance < GAUDI2_TPC_PER_DCORE) { + return tpc_spmu_bases[dcore][instance]; + } + break; + case GAUDI2_ENGINE_EDMA: + if (dcore < GAUDI2_NUM_DCORES && instance < GAUDI2_EDMA_PER_DCORE) { + return edma_spmu_bases[dcore][instance]; + } + break; + case GAUDI2_ENGINE_PDMA: + if (instance == 0) return GAUDI2_PDMA0_SPMU_BASE; + if (instance == 1) return GAUDI2_PDMA1_SPMU_BASE; + break; + default: + break; + } + + return GAUDI2_DCORE0_TPC0_SPMU_BASE; +} + +/* + * PAPI component interface + */ + +/* 
Enumerate all Gaudi2 devices and populate device table */ +static int enumerate_gaudi2_devices(void) +{ + int minor, ctrl_fd, dev_fd, device_type; + int num_found = 0; + + /* First pass: count Gaudi2 devices */ + for (minor = 0; minor < HLTHUNK_MAX_MINOR && num_found < GAUDI2_MAX_DEVICES; minor++) { + ctrl_fd = open_device_by_minor(minor, HLTHUNK_NODE_CONTROL); + if (ctrl_fd < 0) + continue; + + device_type = p_hlthunk_get_device_name_from_fd(ctrl_fd); + close(ctrl_fd); + + if (is_gaudi2_device(device_type)) + num_found++; + } + + if (num_found == 0) { + SUBDBG("No Gaudi2 devices found\n"); + return 0; + } + + /* Allocate device table */ + gaudi2_devices = (gaudi2_device_t *)papi_calloc(num_found, sizeof(gaudi2_device_t)); + if (!gaudi2_devices) { + SUBDBG("Failed to allocate device table\n"); + return -1; + } + + /* Second pass: populate device table */ + gaudi2_num_devices = 0; + for (minor = 0; minor < HLTHUNK_MAX_MINOR && gaudi2_num_devices < num_found; minor++) { + ctrl_fd = open_device_by_minor(minor, HLTHUNK_NODE_CONTROL); + if (ctrl_fd < 0) + continue; + + device_type = p_hlthunk_get_device_name_from_fd(ctrl_fd); + if (!is_gaudi2_device(device_type)) { + close(ctrl_fd); + continue; + } + + close(ctrl_fd); + + /* Try to find an existing fd from the runtime (e.g. PyTorch) first. + * This ensures SPMU counters see the workload on that context. */ + dev_fd = find_existing_fd_for_minor(minor); + int borrowed = (dev_fd >= 0); + + if (dev_fd < 0) { + /* No existing fd found, open new */ + dev_fd = open_device_by_minor(minor, HLTHUNK_NODE_PRIMARY); + } + + if (dev_fd < 0) { + SUBDBG("Failed to open primary device for minor %d\n", minor); + continue; + } + + gaudi2_device_t *dev = &gaudi2_devices[gaudi2_num_devices]; + dev->device_idx = gaudi2_num_devices; + dev->device_fd = dev_fd; + dev->owns_fd = borrowed ? 
0 : 1; + dev->device_type = device_type; + + /* Query hardware IP info */ + memset(&dev->hw_ip, 0, sizeof(dev->hw_ip)); + if (p_hlthunk_get_hw_ip_info(dev_fd, &dev->hw_ip) != 0) { + SUBDBG("Failed to get hw_ip_info for device %d\n", gaudi2_num_devices); + close(dev_fd); + continue; + } + + /* Determine engine availability */ + dev->tpc_avail = (dev->hw_ip.tpc_enabled_mask_ext != 0); + dev->edma_avail = (dev->hw_ip.edma_enabled_mask != 0); + dev->mme_avail = 1; /* Always present on Gaudi2 */ + dev->pdma_avail = 1; /* Always present on Gaudi2 */ + + SUBDBG("Device %d: fd=%d(%s) type=%d TPC=%d EDMA=%d MME=%d PDMA=%d\n", + gaudi2_num_devices, dev_fd, dev->owns_fd ? "owned" : "borrowed", + device_type, dev->tpc_avail, dev->edma_avail, dev->mme_avail, + dev->pdma_avail); + + gaudi2_num_devices++; + } + + return gaudi2_num_devices; +} + +/* Check if an event is available on a specific device */ +static int event_available_on_device(gaudi2_native_event_t *event, gaudi2_device_t *dev) +{ + switch (event->engine) { + case GAUDI2_ENGINE_TPC: return dev->tpc_avail; + case GAUDI2_ENGINE_EDMA: return dev->edma_avail; + case GAUDI2_ENGINE_MME: return dev->mme_avail; + case GAUDI2_ENGINE_PDMA: return dev->pdma_avail; + default: return 0; + } +} + +static int gaudi2_init_component(int cidx) +{ + int papi_errno = PAPI_OK; + + SUBDBG("Initializing Gaudi2 component (cidx=%d)\n", cidx); + + _gaudi2_vector.cmp_info.CmpIdx = cidx; + + /* Load hlthunk library */ + papi_errno = load_hlthunk_library(); + if (papi_errno != PAPI_OK) { + int strLen = snprintf(_gaudi2_vector.cmp_info.disabled_reason, + PAPI_HUGE_STR_LEN, "Failed to load libhl-thunk.so"); + if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) + _gaudi2_vector.cmp_info.disabled_reason[0] = '\0'; + _gaudi2_vector.cmp_info.disabled = papi_errno; + return papi_errno; + } + + /* Enumerate all Gaudi2 devices */ + int num_devices = enumerate_gaudi2_devices(); + if (num_devices <= 0) { + papi_errno = PAPI_ENOSUPP; + int strLen = 
snprintf(_gaudi2_vector.cmp_info.disabled_reason, + PAPI_HUGE_STR_LEN, "No Gaudi2 devices found"); + if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) + _gaudi2_vector.cmp_info.disabled_reason[0] = '\0'; + _gaudi2_vector.cmp_info.disabled = papi_errno; + return papi_errno; + } + + SUBDBG("Found %d Gaudi2 device(s)\n", num_devices); + + /* Count catalog events */ + gaudi2_num_catalog_events = 0; + while (gaudi2_event_catalog[gaudi2_num_catalog_events].name != NULL) + gaudi2_num_catalog_events++; + + if (gaudi2_num_catalog_events == 0) { + papi_errno = PAPI_ENOSUPP; + SUBDBG("No events in catalog\n"); + int strLen = snprintf(_gaudi2_vector.cmp_info.disabled_reason, + PAPI_HUGE_STR_LEN, "No events defined in catalog"); + if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) + _gaudi2_vector.cmp_info.disabled_reason[0] = '\0'; + _gaudi2_vector.cmp_info.disabled = papi_errno; + return papi_errno; + } + + SUBDBG("Catalog has %d base events, %d devices available\n", + gaudi2_num_catalog_events, gaudi2_num_devices); + + /* Device qualifiers are enumerated via PAPI_NTV_ENUM_UMASKS */ + _gaudi2_vector.cmp_info.num_native_events = gaudi2_num_catalog_events; + _gaudi2_vector.cmp_info.num_cntrs = GAUDI2_MAX_SPMU_COUNTERS; + _gaudi2_vector.cmp_info.num_mpx_cntrs = GAUDI2_MAX_COUNTERS; + + gaudi2_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx; + _gaudi2_vector.cmp_info.initialized = 1; + + return PAPI_OK; +} + +static int gaudi2_shutdown_component(void) +{ + int d; + + /* Close device file descriptors that were opened (not borrowed from runtime) */ + if (gaudi2_devices) { + for (d = 0; d < gaudi2_num_devices; d++) { + if (gaudi2_devices[d].device_fd >= 0 && gaudi2_devices[d].owns_fd) { + close(gaudi2_devices[d].device_fd); + gaudi2_devices[d].device_fd = -1; + } + } + papi_free(gaudi2_devices); + gaudi2_devices = NULL; + } + gaudi2_num_devices = 0; + gaudi2_num_catalog_events = 0; + + if (hlthunk_handle) { + dlclose(hlthunk_handle); + hlthunk_handle = NULL; + } + + 
_gaudi2_vector.cmp_info.initialized = 0; + return PAPI_OK; +} + +static int gaudi2_init_thread(hwd_context_t *ctx) +{ + gaudi2_context_t *gaudi2_ctx = (gaudi2_context_t *)ctx; + + memset(gaudi2_ctx, 0, sizeof(gaudi2_context_t)); + return PAPI_OK; +} + +static int gaudi2_shutdown_thread(hwd_context_t *ctx) +{ + gaudi2_context_t *gaudi2_ctx = (gaudi2_context_t *)ctx; + int d; + + /* Disable debug mode on all devices that were enabled */ + for (d = 0; d < gaudi2_num_devices; d++) { + if (gaudi2_ctx->debug_mode_enabled[d] && gaudi2_devices[d].device_fd >= 0) { + disable_debug_mode(gaudi2_devices[d].device_fd); + gaudi2_ctx->debug_mode_enabled[d] = 0; + } + } + + return PAPI_OK; +} + +static int gaudi2_init_control_state(hwd_control_state_t *ctl) +{ + memset(ctl, 0, sizeof(gaudi2_control_t)); + return PAPI_OK; +} + +static int gaudi2_cleanup_eventset(hwd_control_state_t *ctl) +{ + memset(ctl, 0, sizeof(gaudi2_control_t)); + return PAPI_OK; +} + +static int gaudi2_update_control_state(hwd_control_state_t *ctl, + NativeInfo_t *native, + int count, + hwd_context_t *ctx) +{ + gaudi2_control_t *gaudi2_ctl = (gaudi2_control_t *)ctl; + gaudi2_event_info_t evt_info; + int i, d, papi_errno; + (void)ctx; + + if (count > GAUDI2_MAX_COUNTERS) { + SUBDBG("Event count %d exceeds max %d\n", count, GAUDI2_MAX_COUNTERS); + return PAPI_ECOUNT; + } + + /* Reset control state */ + memset(gaudi2_ctl->device_ctl, 0, sizeof(gaudi2_ctl->device_ctl)); + gaudi2_ctl->active_device_mask = 0; + gaudi2_ctl->num_active_devices = 0; + gaudi2_ctl->num_counters = count; + + /* Process each event and organize by device */ + for (i = 0; i < count; i++) { + unsigned int event_code = native[i].ni_event; + + /* Decode event code to get nameid, device, and flags */ + papi_errno = gaudi2_evt_id_to_info(event_code, &evt_info); + if (papi_errno != PAPI_OK) { + SUBDBG("Failed to decode event code %u\n", event_code); + return papi_errno; + } + + if (evt_info.nameid < 0 || evt_info.nameid >= 
gaudi2_num_catalog_events) { + SUBDBG("Invalid nameid %d (max %d)\n", evt_info.nameid, gaudi2_num_catalog_events); + return PAPI_EINVAL; + } + + if (evt_info.device < 0 || evt_info.device >= gaudi2_num_devices) { + SUBDBG("Invalid device %d (max %d)\n", evt_info.device, gaudi2_num_devices); + return PAPI_EINVAL; + } + + gaudi2_native_event_t *cat_evt = &gaudi2_event_catalog[evt_info.nameid]; + + /* Set up counter tracking */ + gaudi2_ctl->counters[i].event_code = event_code; + gaudi2_ctl->counters[i].device_idx = evt_info.device; + gaudi2_ctl->counters[i].catalog_idx = evt_info.nameid; + gaudi2_ctl->counters[i].spmu_base = get_spmu_base_address(cat_evt->engine, 0, 0); + gaudi2_ctl->counters[i].last_value = 0; + gaudi2_ctl->counters[i].accumulated = 0; + + /* Track this device */ + if (!(gaudi2_ctl->active_device_mask & (1 << evt_info.device))) { + gaudi2_ctl->active_device_mask |= (1 << evt_info.device); + gaudi2_ctl->device_ctl[evt_info.device].device_idx = evt_info.device; + } + + /* Add event to device's event list */ + gaudi2_device_ctl_t *dev_ctl = &gaudi2_ctl->device_ctl[evt_info.device]; + dev_ctl->event_indices[dev_ctl->num_events] = i; + gaudi2_ctl->counters[i].counter_idx = dev_ctl->num_events % GAUDI2_MAX_SPMU_COUNTERS; + dev_ctl->num_events++; + + native[i].ni_position = i; + } + + /* Count active devices */ + for (d = 0; d < gaudi2_num_devices; d++) { + if (gaudi2_ctl->active_device_mask & (1 << d)) + gaudi2_ctl->num_active_devices++; + } + + SUBDBG("Configured %d events across %d devices (mask=0x%x)\n", + count, gaudi2_ctl->num_active_devices, gaudi2_ctl->active_device_mask); + + return PAPI_OK; +} + +static int gaudi2_start(hwd_context_t *ctx, hwd_control_state_t *ctl) +{ + gaudi2_context_t *gaudi2_ctx = (gaudi2_context_t *)ctx; + gaudi2_control_t *gaudi2_ctl = (gaudi2_control_t *)ctl; + int d, i, papi_errno; + + /* For each device with events, enable debug mode and SPMU */ + for (d = 0; d < gaudi2_num_devices; d++) { + if 
(!(gaudi2_ctl->active_device_mask & (1 << d))) + continue; + + gaudi2_device_ctl_t *dev_ctl = &gaudi2_ctl->device_ctl[d]; + int dev_fd = gaudi2_devices[d].device_fd; + + /* Enable debug mode on this device if not already enabled */ + if (!gaudi2_ctx->debug_mode_enabled[d]) { + papi_errno = enable_debug_mode(dev_fd); + if (papi_errno != PAPI_OK) { + SUBDBG("Failed to enable debug mode on device %d\n", d); + return papi_errno; + } + gaudi2_ctx->debug_mode_enabled[d] = 1; + } + + /* Build event array for this device */ + uint64_t events[HL_DEBUG_MAX_AUX_VALUES]; + int num_dev_events = dev_ctl->num_events; + if (num_dev_events > HL_DEBUG_MAX_AUX_VALUES) + num_dev_events = HL_DEBUG_MAX_AUX_VALUES; + + for (i = 0; i < num_dev_events; i++) { + int counter_idx = dev_ctl->event_indices[i]; + int catalog_idx = gaudi2_ctl->counters[counter_idx].catalog_idx; + events[i] = gaudi2_event_catalog[catalog_idx].event_id; + } + + /* Enable SPMU on this device */ + papi_errno = enable_spmu(dev_fd, 0, events, num_dev_events); + if (papi_errno != PAPI_OK) { + SUBDBG("Failed to enable SPMU on device %d\n", d); + return papi_errno; + } + dev_ctl->spmu_enabled = 1; + + SUBDBG("Started %d events on device %d\n", num_dev_events, d); + } + + /* Reset all counter values */ + for (i = 0; i < gaudi2_ctl->num_counters; i++) { + gaudi2_ctl->counters[i].last_value = 0; + gaudi2_ctl->counters[i].accumulated = 0; + } + + gaudi2_ctl->running = GAUDI2_EVENTS_RUNNING; + return PAPI_OK; +} + +static int gaudi2_stop(hwd_context_t *ctx, hwd_control_state_t *ctl) +{ + gaudi2_context_t *gaudi2_ctx = (gaudi2_context_t *)ctx; + gaudi2_control_t *gaudi2_ctl = (gaudi2_control_t *)ctl; + int d, i; + (void)gaudi2_ctx; + + if (gaudi2_ctl->running == GAUDI2_EVENTS_RUNNING) { + /* Read final values from each device */ + for (d = 0; d < gaudi2_num_devices; d++) { + if (!(gaudi2_ctl->active_device_mask & (1 << d))) + continue; + + gaudi2_device_ctl_t *dev_ctl = &gaudi2_ctl->device_ctl[d]; + int dev_fd = 
gaudi2_devices[d].device_fd; + + /* Read counters for this device */ + long long temp_values[GAUDI2_MAX_SPMU_COUNTERS]; + int num_dev_events = dev_ctl->num_events; + if (num_dev_events > GAUDI2_MAX_SPMU_COUNTERS) + num_dev_events = GAUDI2_MAX_SPMU_COUNTERS; + + /* Get SPMU base for first event on this device */ + int first_counter_idx = dev_ctl->event_indices[0]; + uint64_t base = gaudi2_ctl->counters[first_counter_idx].spmu_base; + + if (read_spmu_counters(dev_fd, base, num_dev_events, temp_values) == PAPI_OK) { + for (i = 0; i < num_dev_events; i++) { + int counter_idx = dev_ctl->event_indices[i]; + gaudi2_ctl->counters[counter_idx].accumulated += temp_values[i]; + gaudi2_ctl->values[counter_idx] = gaudi2_ctl->counters[counter_idx].accumulated; + } + } + + /* Disable SPMU on this device */ + if (dev_ctl->spmu_enabled) { + disable_spmu(dev_fd, 0); + dev_ctl->spmu_enabled = 0; + } + } + } + + gaudi2_ctl->running = GAUDI2_EVENTS_STOPPED; + return PAPI_OK; +} + +static int gaudi2_read(hwd_context_t *ctx, hwd_control_state_t *ctl, + long long **events, int flags) +{ + gaudi2_context_t *gaudi2_ctx = (gaudi2_context_t *)ctx; + gaudi2_control_t *gaudi2_ctl = (gaudi2_control_t *)ctl; + int d, i; + (void)gaudi2_ctx; + (void)flags; + + if (gaudi2_ctl->running == GAUDI2_EVENTS_RUNNING) { + /* Read current values from each device */ + for (d = 0; d < gaudi2_num_devices; d++) { + if (!(gaudi2_ctl->active_device_mask & (1 << d))) + continue; + + gaudi2_device_ctl_t *dev_ctl = &gaudi2_ctl->device_ctl[d]; + int dev_fd = gaudi2_devices[d].device_fd; + + long long temp_values[GAUDI2_MAX_SPMU_COUNTERS]; + int num_dev_events = dev_ctl->num_events; + if (num_dev_events > GAUDI2_MAX_SPMU_COUNTERS) + num_dev_events = GAUDI2_MAX_SPMU_COUNTERS; + + int first_counter_idx = dev_ctl->event_indices[0]; + uint64_t base = gaudi2_ctl->counters[first_counter_idx].spmu_base; + + if (read_spmu_counters(dev_fd, base, num_dev_events, temp_values) == PAPI_OK) { + for (i = 0; i < num_dev_events; i++) { 
+ int counter_idx = dev_ctl->event_indices[i]; + gaudi2_ctl->values[counter_idx] = + gaudi2_ctl->counters[counter_idx].accumulated + temp_values[i]; + } + } + } + } + + *events = gaudi2_ctl->values; + return PAPI_OK; +} + +static int gaudi2_reset(hwd_context_t *ctx, hwd_control_state_t *ctl) +{ + gaudi2_control_t *gaudi2_ctl = (gaudi2_control_t *)ctl; + (void)ctx; + + for (int i = 0; i < gaudi2_ctl->num_counters; i++) { + gaudi2_ctl->counters[i].last_value = 0; + gaudi2_ctl->counters[i].accumulated = 0; + gaudi2_ctl->values[i] = 0; + } + + return PAPI_OK; +} + +static int gaudi2_ntv_enum_events(unsigned int *EventCode, int modifier) +{ + gaudi2_event_info_t info; + int papi_errno = PAPI_OK; + + switch (modifier) { + case PAPI_ENUM_FIRST: + /* Return first base event */ + if (gaudi2_num_catalog_events == 0) + return PAPI_ENOEVNT; + info.nameid = 0; + info.device = 0; + info.flags = 0; + papi_errno = gaudi2_evt_id_create(&info, EventCode); + return papi_errno; + + case PAPI_ENUM_EVENTS: + /* Iterate through base events only */ + papi_errno = gaudi2_evt_id_to_info(*EventCode, &info); + if (papi_errno != PAPI_OK) + return papi_errno; + if (info.nameid + 1 < gaudi2_num_catalog_events) { + info.nameid++; + info.device = 0; + info.flags = 0; + return gaudi2_evt_id_create(&info, EventCode); + } + return PAPI_ENOEVNT; + + case PAPI_NTV_ENUM_UMASKS: + /* Enumerate device qualifier */ + papi_errno = gaudi2_evt_id_to_info(*EventCode, &info); + if (papi_errno != PAPI_OK) + return papi_errno; + /* If flags=0 (base event), return device qualifier entry */ + if (info.flags == 0) { + info.device = 0; + info.flags = GAUDI2_DEVICE_FLAG; + return gaudi2_evt_id_create(&info, EventCode); + } + /* Only one qualifier (device) */ + return PAPI_ENOEVNT; + + default: + return PAPI_EINVAL; + } +} + +static int gaudi2_ntv_code_to_name(unsigned int EventCode, char *name, int len) +{ + gaudi2_event_info_t info; + int papi_errno; + + papi_errno = gaudi2_evt_id_to_info(EventCode, &info); + if 
(papi_errno != PAPI_OK) + return papi_errno; + + if (info.nameid < 0 || info.nameid >= gaudi2_num_catalog_events) { + SUBDBG("nameid %d out of range (max %d)\n", info.nameid, gaudi2_num_catalog_events); + return PAPI_ENOEVNT; + } + + int strLen; + + switch (info.flags) { + case GAUDI2_DEVICE_FLAG: + /* Event with device qualifier */ + strLen = snprintf(name, len, "%s:device=%d", + gaudi2_event_catalog[info.nameid].name, info.device); + break; + default: + /* Base event (flags=0) */ + strLen = snprintf(name, len, "%s", gaudi2_event_catalog[info.nameid].name); + break; + } + + if (strLen < 0 || strLen >= len) + return PAPI_EINVAL; + + return PAPI_OK; +} + +/* Parse event name and convert to event code */ +static int gaudi2_ntv_name_to_code(const char *name, unsigned int *EventCode) +{ + char base_name[PAPI_HUGE_STR_LEN]; + gaudi2_event_info_t info; + const char *device_ptr; + int i; + + /* Copy name to extract base */ + strncpy(base_name, name, sizeof(base_name) - 1); + base_name[sizeof(base_name) - 1] = '\0'; + + /* Default device and flags */ + info.device = 0; + info.flags = GAUDI2_DEVICE_FLAG; + + device_ptr = strstr(name, ":device="); + if (device_ptr != NULL) { + info.device = atoi(device_ptr + 8); + base_name[device_ptr - name] = '\0'; + } + + /* Validate device index */ + if (info.device < 0 || info.device >= gaudi2_num_devices) { + SUBDBG("Invalid device %d in event name '%s' (max %d)\n", + info.device, name, gaudi2_num_devices - 1); + return PAPI_ENOEVNT; + } + + /* Find base event in catalog */ + info.nameid = -1; + for (i = 0; i < gaudi2_num_catalog_events; i++) { + if (strcmp(base_name, gaudi2_event_catalog[i].name) == 0) { + info.nameid = i; + break; + } + } + + if (info.nameid < 0) { + SUBDBG("Event '%s' (base='%s') not found in catalog\n", name, base_name); + return PAPI_ENOEVNT; + } + + /* Check if event is available on specified device */ + if (!event_available_on_device(&gaudi2_event_catalog[info.nameid], + &gaudi2_devices[info.device])) { + 
SUBDBG("Event '%s' not available on device %d\n", base_name, info.device); + return PAPI_ENOEVNT; + } + + return gaudi2_evt_id_create(&info, EventCode); +} + +static int gaudi2_ntv_code_to_descr(unsigned int EventCode, char *descr, int len) +{ + gaudi2_event_info_t info; + int papi_errno; + + papi_errno = gaudi2_evt_id_to_info(EventCode, &info); + if (papi_errno != PAPI_OK) + return papi_errno; + + if (info.nameid < 0 || info.nameid >= gaudi2_num_catalog_events) { + SUBDBG("nameid %d out of range (max %d)\n", info.nameid, gaudi2_num_catalog_events); + return PAPI_ENOEVNT; + } + + int strLen = snprintf(descr, len, "%s", gaudi2_event_catalog[info.nameid].description); + if (strLen < 0 || strLen >= len) + return PAPI_EINVAL; + return PAPI_OK; +} + +static int gaudi2_ntv_code_to_info(unsigned int EventCode, PAPI_event_info_t *info) +{ + gaudi2_event_info_t evt_info; + char devices[PAPI_HUGE_STR_LEN]; + int papi_errno; + int d, first_avail_device = 0; + int strLen; + size_t offset; + + papi_errno = gaudi2_evt_id_to_info(EventCode, &evt_info); + if (papi_errno != PAPI_OK) + return papi_errno; + + if (evt_info.nameid < 0 || evt_info.nameid >= gaudi2_num_catalog_events) { + SUBDBG("nameid %d out of range (max %d)\n", evt_info.nameid, gaudi2_num_catalog_events); + return PAPI_ENOEVNT; + } + + gaudi2_native_event_t *cat_evt = &gaudi2_event_catalog[evt_info.nameid]; + + devices[0] = '\0'; + offset = 0; + for (d = 0; d < gaudi2_num_devices; d++) { + if (event_available_on_device(cat_evt, &gaudi2_devices[d])) { + if (offset == 0) { + first_avail_device = d; + } + if (offset > 0) { + strLen = snprintf(devices + offset, sizeof(devices) - offset, ","); + if (strLen < 0 || strLen >= (int)(sizeof(devices) - offset)) + return PAPI_EINVAL; + offset += strLen; + } + strLen = snprintf(devices + offset, sizeof(devices) - offset, "%d", d); + if (strLen < 0 || strLen >= (int)(sizeof(devices) - offset)) + return PAPI_EINVAL; + offset += strLen; + } + } + + switch (evt_info.flags) { + case 
GAUDI2_DEVICE_FLAG: + /* Device qualifier entry - shown when enumerating UMASKS */ + strLen = snprintf(info->symbol, sizeof(info->symbol), + "%s:device=%d", cat_evt->name, first_avail_device); + if (strLen < 0 || strLen >= (int)sizeof(info->symbol)) + return PAPI_EINVAL; + strLen = snprintf(info->long_descr, sizeof(info->long_descr), + "%s masks:Mandatory device qualifier [%s]", + cat_evt->description, devices); + if (strLen < 0 || strLen >= (int)sizeof(info->long_descr)) + return PAPI_EINVAL; + break; + + default: + /* Base event (flags=0) - shown when enumerating events */ + strLen = snprintf(info->symbol, sizeof(info->symbol), + "%s", cat_evt->name); + if (strLen < 0 || strLen >= (int)sizeof(info->symbol)) + return PAPI_EINVAL; + strLen = snprintf(info->long_descr, sizeof(info->long_descr), + "%s", cat_evt->description); + if (strLen < 0 || strLen >= (int)sizeof(info->long_descr)) + return PAPI_EINVAL; + break; + } + + strLen = snprintf(info->short_descr, sizeof(info->short_descr), + "%s", cat_evt->description); + if (strLen < 0 || strLen >= (int)sizeof(info->short_descr)) + return PAPI_EINVAL; + info->event_code = EventCode; + info->component_index = _gaudi2_vector.cmp_info.CmpIdx; + + return PAPI_OK; +} + +static int gaudi2_set_domain(hwd_control_state_t *ctl, int domain) +{ + (void)ctl; + (void)domain; + return PAPI_OK; +} + +static int gaudi2_ctl(hwd_context_t *ctx, int code, _papi_int_option_t *option) +{ + (void)ctx; + (void)code; + (void)option; + return PAPI_OK; +} + +/* PAPI vector table */ +papi_vector_t _gaudi2_vector = { + .cmp_info = { + .name = "gaudi2", + .short_name = "gaudi2", + .version = "1.0", + .description = "Intel Gaudi2 AI Accelerator hardware counters", + .num_mpx_cntrs = GAUDI2_MAX_COUNTERS, + .num_cntrs = GAUDI2_MAX_SPMU_COUNTERS, + .default_domain = PAPI_DOM_USER, + .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL, + .default_granularity = PAPI_GRN_THR, + .available_granularities = PAPI_GRN_THR, + .hardware_intr_sig = 
PAPI_INT_SIGNAL, + .fast_real_timer = 0, + .fast_virtual_timer = 0, + .attach = 0, + .attach_must_ptrace = 0, + }, + + .size = { + .context = sizeof(gaudi2_context_t), + .control_state = sizeof(gaudi2_control_t), + .reg_value = 1, + .reg_alloc = 1, + }, + + .init_component = gaudi2_init_component, + .init_thread = gaudi2_init_thread, + .init_control_state = gaudi2_init_control_state, + .shutdown_component = gaudi2_shutdown_component, + .shutdown_thread = gaudi2_shutdown_thread, + .cleanup_eventset = gaudi2_cleanup_eventset, + + .update_control_state = gaudi2_update_control_state, + .start = gaudi2_start, + .stop = gaudi2_stop, + .read = gaudi2_read, + .reset = gaudi2_reset, + + .ntv_enum_events = gaudi2_ntv_enum_events, + .ntv_code_to_name = gaudi2_ntv_code_to_name, + .ntv_name_to_code = gaudi2_ntv_name_to_code, + .ntv_code_to_descr = gaudi2_ntv_code_to_descr, + .ntv_code_to_info = gaudi2_ntv_code_to_info, + + .set_domain = gaudi2_set_domain, + .ctl = gaudi2_ctl, +}; diff --git a/src/components/gaudi2/tests/Makefile b/src/components/gaudi2/tests/Makefile new file mode 100644 index 000000000..9ecb816c0 --- /dev/null +++ b/src/components/gaudi2/tests/Makefile @@ -0,0 +1,10 @@ +NAME=gaudi2 +include ../../Makefile_comp_tests.target + +# Gaudi2 component tests are Python scripts in python/ subdirectory. +# No C/C++ compilation needed. +gaudi2_tests: + @echo "Gaudi2 tests are Python scripts. Run: cd python && bash run_tests.sh" + +clean: + @echo "Nothing to clean for gaudi2 Python tests" diff --git a/src/components/gaudi2/tests/python/run_tests.sh b/src/components/gaudi2/tests/python/run_tests.sh new file mode 100755 index 000000000..6e821743f --- /dev/null +++ b/src/components/gaudi2/tests/python/run_tests.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# +# run_tests.sh +# +# Test runner for PAPI Gaudi2 component Python tests. +# Sets up the environment and runs each test script. 
+# +# Usage: +# PT_HPU_LAZY_MODE=1 bash run_tests.sh +# +# Environment variables: +# PAPI_DIR - Path to PAPI install directory (auto-detected if not set) +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Auto-detect PAPI install directory +if [ -z "$PAPI_DIR" ]; then + PAPI_DIR="$(cd "$SCRIPT_DIR/../../../../install" 2>/dev/null && pwd)" +fi + +if [ ! -d "$PAPI_DIR" ]; then + echo "ERROR: PAPI install directory not found: $PAPI_DIR" + echo "Set PAPI_DIR environment variable or build PAPI first." + exit 1 +fi + +echo "Using PAPI_DIR: $PAPI_DIR" + +# Setup environment +export LD_LIBRARY_PATH="$PAPI_DIR/lib:${LD_LIBRARY_PATH:-}" +export PATH="$PAPI_DIR/bin:$PATH" + +# Test list +TESTS=( + "test_component_and_events.py" + "test_start_stop_read.py" + "test_multidevice.py" +) + +# Run tests +TOTAL=0 +PASSED=0 +FAILED=0 + +for test in "${TESTS[@]}"; do + TOTAL=$((TOTAL + 1)) + echo "Running: $test" + echo -e "-------------------------------------\n" + + python3 "$SCRIPT_DIR/$test" + rc=$? + + if [ $rc -eq 0 ]; then + PASSED=$((PASSED + 1)) + echo -e "Result: \e[32mPASSED\e[0m" + else + FAILED=$((FAILED + 1)) + echo -e "Result: \e[31mFAILED\e[0m (exit code $rc)" + fi + + echo -e "-------------------------------------\n" +done + +# Summary +echo "" +echo "OVERALL SUMMARY" +echo -e "-------------------------------------\n" +echo " Total: $TOTAL" +echo " Passed: $PASSED" +echo " Failed: $FAILED" +echo -e "-------------------------------------\n" + +if [ $FAILED -gt 0 ]; then + exit 1 +fi + +exit 0 diff --git a/src/components/gaudi2/tests/python/test_component_and_events.py b/src/components/gaudi2/tests/python/test_component_and_events.py new file mode 100644 index 000000000..a3fd6e8ff --- /dev/null +++ b/src/components/gaudi2/tests/python/test_component_and_events.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +""" +test_component_and_events.py + +Test script for PAPI Gaudi2 component availability and event enumeration. 
+Verifies that the gaudi2 component is loaded, enabled, and exposes the +expected native events using PAPI command-line utilities. + +Prerequisites: + - PAPI built and installed with gaudi2 component + - papi_component_avail and papi_native_avail in PATH + +Usage: + python3 test_component_and_events.py +""" + +import subprocess +import sys +import os +import shutil + +# Constants +# Known events that must be present in the gaudi2 event catalog +REQUIRED_EVENTS = [ + "TPC_KERNEL_EXECUTED", + "TPC_STALL", + "EDMA_DESC_PUSH", + "MME_NUM_OUTER_PRODUCTS", +] + +# Helper Functions + +def run_command(cmd): + """Run a command and return (returncode, stdout, stderr)""" + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30, + ) + return result.returncode, result.stdout, result.stderr + except FileNotFoundError: + return -1, "", f"Command not found: {cmd[0]}" + except subprocess.TimeoutExpired: + return -1, "", f"Command timed out: {' '.join(cmd)}" + +# Test: Component Availability + +def test_component_avail(): + """Verify gaudi2 component is listed and not disabled""" + + if not shutil.which("papi_component_avail"): + print(" SKIP: papi_component_avail not found in PATH") + return None + + rc, stdout, stderr = run_command(["papi_component_avail"]) + if rc != 0: + print(f" FAILED: papi_component_avail returned {rc}") + if stderr: + print(f" stderr: {stderr.strip()}") + return False + + # Check that gaudi2 component appears in the output + found_gaudi2 = False + is_disabled = False + + for line in stdout.splitlines(): + if "gaudi2" in line.lower(): + found_gaudi2 = True + print(f" Found: {line.strip()}") + if "disabled" in line.lower(): + is_disabled = True + + if not found_gaudi2: + print(" FAILED: gaudi2 component not found in papi_component_avail output") + return False + + if is_disabled: + print(" FAILED: gaudi2 component is disabled") + return False + + print(" gaudi2 component is available and enabled") + return True + +# Test: 
Native Event Enumeration + +def test_native_events(): + """Verify gaudi2 native events are correctly enumerated""" + + if not shutil.which("papi_native_avail"): + print(" SKIP: papi_native_avail not found in PATH") + return None + + rc, stdout, stderr = run_command(["papi_native_avail"]) + if rc != 0: + print(f" FAILED: papi_native_avail returned {rc}") + if stderr: + print(f" stderr: {stderr.strip()}") + return False + + output_lines = stdout.splitlines() + + # Check for required events + print("\n Checking required events:") + all_found = True + for event_name in REQUIRED_EVENTS: + found = any(event_name in line for line in output_lines) + status = "FOUND" if found else "MISSING" + print(f" {event_name:<35} {status}") + if not found: + all_found = False + + if not all_found: + print("\n FAILED: Some required events are missing") + return False + + # Check that device qualifiers appear + print("\n Checking device qualifiers:") + has_device_qualifier = any(":device=" in line for line in output_lines) + if has_device_qualifier: + print(" :device= qualifier found") + else: + print(" WARNING: No :device= qualifiers found in output") + + # Count total events listed + event_count = 0 + for line in output_lines: + # Count lines that begin with a known event-name prefix + stripped = line.strip() + if stripped and any(stripped.startswith(prefix) for prefix in + ["TPC_", "EDMA_", "MME_"]): + event_count += 1 + + print(f"\n Total event entries found: {event_count}") + + # Check that event entries include descriptions (formatted as "event name | description") + has_descriptions = any("|" in line and len(line.split("|")) >= 2 + for line in output_lines + if any(ev in line for ev in REQUIRED_EVENTS)) + if has_descriptions: + print(" Event descriptions present") + + print("\n All required events found") + return True + +# Main + +def main(): + results = {} + + results["Component Availability"] = test_component_avail() + results["Native Event Enumeration"] = test_native_events() + + # Summary + passed = 0 + failed = 0 + skipped = 0 + + for name, result in 
results.items(): + if result is None: + status = "SKIPPED" + skipped += 1 + elif result: + status = "PASSED" + passed += 1 + else: + status = "FAILED" + failed += 1 + print(f" {name:<40} {status}") + + print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped") + + if failed > 0: + print("\nFAILED") + return 1 + + print("\nPASSED") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/components/gaudi2/tests/python/test_multidevice.py b/src/components/gaudi2/tests/python/test_multidevice.py new file mode 100644 index 000000000..2cc44fe3f --- /dev/null +++ b/src/components/gaudi2/tests/python/test_multidevice.py @@ -0,0 +1,572 @@ +#!/usr/bin/env python3 +""" +test_multidevice.py + +Test script for PAPI Gaudi2 component multi-device support. +Tests device qualifiers, monitoring the same event across multiple devices, +mixed events across devices, invalid device handling, and multiple reads +during workload execution. + +Prerequisites: + - PAPI built and installed with gaudi2 component + - libpapi.so in LD_LIBRARY_PATH + - PyTorch with Habana support + - One or more Gaudi2 devices available + +Usage: + PT_HPU_LAZY_MODE=1 python3 test_multidevice.py +""" + +import ctypes +import sys +import os + +# PAPI Constants + +PAPI_VER_CURRENT = 0x07030000 +PAPI_NULL = -1 +PAPI_OK = 0 + +# Load PAPI Library + +def load_papi(): + """Load PAPI shared library""" + lib_paths = [ + "libpapi.so", + ] + + for path in lib_paths: + try: + papi = ctypes.CDLL(path) + print(f" Loaded PAPI from: {path}") + return papi + except OSError: + continue + + print(" ERROR: Could not load libpapi.so") + print(" Make sure PAPI is built with gaudi2 component and in LD_LIBRARY_PATH") + return None + +# PAPI Wrapper + +class PAPIWrapper: + """Wrapper for PAPI library functions using ctypes""" + + def __init__(self, papi): + self.papi = papi + self._setup_functions() + + def _setup_functions(self): + """Setup ctypes function signatures""" + 
self.papi.PAPI_library_init.argtypes = [ctypes.c_int] + self.papi.PAPI_library_init.restype = ctypes.c_int + + self.papi.PAPI_create_eventset.argtypes = [ctypes.POINTER(ctypes.c_int)] + self.papi.PAPI_create_eventset.restype = ctypes.c_int + + self.papi.PAPI_add_named_event.argtypes = [ctypes.c_int, ctypes.c_char_p] + self.papi.PAPI_add_named_event.restype = ctypes.c_int + + self.papi.PAPI_start.argtypes = [ctypes.c_int] + self.papi.PAPI_start.restype = ctypes.c_int + + self.papi.PAPI_stop.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_longlong)] + self.papi.PAPI_stop.restype = ctypes.c_int + + self.papi.PAPI_read.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_longlong)] + self.papi.PAPI_read.restype = ctypes.c_int + + self.papi.PAPI_reset.argtypes = [ctypes.c_int] + self.papi.PAPI_reset.restype = ctypes.c_int + + self.papi.PAPI_cleanup_eventset.argtypes = [ctypes.c_int] + self.papi.PAPI_cleanup_eventset.restype = ctypes.c_int + + self.papi.PAPI_destroy_eventset.argtypes = [ctypes.POINTER(ctypes.c_int)] + self.papi.PAPI_destroy_eventset.restype = ctypes.c_int + + self.papi.PAPI_shutdown.argtypes = [] + self.papi.PAPI_shutdown.restype = None + + self.papi.PAPI_strerror.argtypes = [ctypes.c_int] + self.papi.PAPI_strerror.restype = ctypes.c_char_p + + def library_init(self, version=PAPI_VER_CURRENT): + """Initialize PAPI library""" + ret = self.papi.PAPI_library_init(version) + if ret != version: + if ret > 0: + print(f" PAPI version mismatch: got 0x{ret:08x}, expected 0x{version:08x}") + else: + print(f" PAPI init failed: {self.strerror(ret)}") + return False + return True + + def create_eventset(self): + """Create a new event set""" + eventset = ctypes.c_int(PAPI_NULL) + ret = self.papi.PAPI_create_eventset(ctypes.byref(eventset)) + if ret != PAPI_OK: + print(f" Failed to create eventset: {self.strerror(ret)}") + return None + return eventset.value + + def add_named_event(self, eventset, name): + """Add a named event to the event set""" + ret = 
self.papi.PAPI_add_named_event(eventset, name.encode()) + if ret != PAPI_OK: + print(f" Failed to add event '{name}': {self.strerror(ret)}") + return False + return True + + def start(self, eventset): + """Start counting""" + ret = self.papi.PAPI_start(eventset) + if ret != PAPI_OK: + print(f" Failed to start: {self.strerror(ret)}") + return False + return True + + def stop(self, eventset, num_events): + """Stop counting and return values""" + values = (ctypes.c_longlong * num_events)() + ret = self.papi.PAPI_stop(eventset, values) + if ret != PAPI_OK: + print(f" Failed to stop: {self.strerror(ret)}") + return None + return list(values) + + def read(self, eventset, num_events): + """Read current counter values""" + values = (ctypes.c_longlong * num_events)() + ret = self.papi.PAPI_read(eventset, values) + if ret != PAPI_OK: + print(f" Failed to read: {self.strerror(ret)}") + return None + return list(values) + + def reset(self, eventset): + """Reset counters""" + ret = self.papi.PAPI_reset(eventset) + return ret == PAPI_OK + + def cleanup_eventset(self, eventset): + """Cleanup event set""" + self.papi.PAPI_cleanup_eventset(eventset) + + def destroy_eventset(self, eventset): + """Destroy event set""" + es = ctypes.c_int(eventset) + self.papi.PAPI_destroy_eventset(ctypes.byref(es)) + + def shutdown(self): + """Shutdown PAPI""" + self.papi.PAPI_shutdown() + + def strerror(self, code): + """Get error string""" + msg = self.papi.PAPI_strerror(code) + return msg.decode() if msg else f"Unknown error {code}" + +# PyTorch Initialization and Workload + +_pytorch_initialized = False +_torch = None +_hthpu = None +_device = None + +def init_pytorch(): + """Initialize PyTorch and acquire the Gaudi2 device BEFORE PAPI init""" + global _pytorch_initialized, _torch, _hthpu, _device + + if _pytorch_initialized: + return True + + try: + import torch + import habana_frameworks.torch.hpu as hthpu + + _torch = torch + _hthpu = hthpu + + print(" Initializing PyTorch HPU...") + _device = 
torch.device("hpu") + + # Force device initialization by creating a small tensor + _ = torch.zeros(1, device=_device) + hthpu.synchronize() + + _pytorch_initialized = True + print(" PyTorch HPU initialized successfully") + return True + + except ImportError as e: + print(f" PyTorch/Habana not available: {e}") + return False + except Exception as e: + print(f" Failed to initialize PyTorch HPU: {e}") + return False + +def run_pytorch_workload(): + """Run a PyTorch matmul workload on Gaudi2""" + global _torch, _hthpu, _device + + if not _pytorch_initialized: + print(" ERROR: PyTorch not initialized") + return False + + try: + dtype = _torch.float32 + size = 1024 + a = _torch.randn(size, size, dtype=dtype, device=_device) + b = _torch.randn(size, size, dtype=dtype, device=_device) + + # Warm-up + for _ in range(3): + c = _torch.matmul(a, b) + _hthpu.synchronize() + + # Actual workload + for _ in range(10): + c = _torch.matmul(a, b) + _hthpu.synchronize() + + return True + + except Exception as e: + print(f" Workload failed: {e}") + return False + +# Helper Functions + +def detect_devices_by_probing(papi): + """Detect the number of Gaudi2 devices that PAPI can actually use by + attempting to add an event with increasing device indices. + Not all /dev/accel/accel* files are necessarily Gaudi2 devices, so + we probe PAPI directly to find the valid device count. 
+ Returns (num_devices, device_ids) where device_ids is a 0-based list.""" + device_ids = [] + for d in range(16): + eventset = papi.create_eventset() + if eventset is None: + break + event_name = f"gaudi2:::TPC_KERNEL_EXECUTED:device={d}" + ret = papi.papi.PAPI_add_named_event(eventset, event_name.encode()) + if ret == PAPI_OK: + device_ids.append(d) + papi.cleanup_eventset(eventset) + papi.destroy_eventset(eventset) + if ret != PAPI_OK: + break + return len(device_ids), device_ids + +def cleanup(papi, eventset): + """Cleanup eventset and shutdown PAPI""" + if eventset is not None: + papi.cleanup_eventset(eventset) + papi.destroy_eventset(eventset) + +# Sub-test 1: Single event on a specific device + +def test_single_device_qualifier(papi, device_id): + """Test adding a single event with an explicit :device=N qualifier""" + print(f"Sub-test 1: Single event with :device={device_id} qualifier") + eventset = papi.create_eventset() + if eventset is None: + return False + + event_name = f"gaudi2:::TPC_KERNEL_EXECUTED:device={device_id}" + print(f" Adding event: {event_name}") + if not papi.add_named_event(eventset, event_name): + papi.destroy_eventset(eventset) + return False + print(f" Event added successfully") + + print(f" Starting counters...") + if not papi.start(eventset): + cleanup(papi, eventset) + return False + + print(f" Running workload...") + run_pytorch_workload() + + values = papi.stop(eventset, 1) + if values is None: + cleanup(papi, eventset) + return False + + print(f"\n Result:") + print(f" TPC_KERNEL_EXECUTED:device={device_id} = {values[0]:>15,}") + + cleanup(papi, eventset) + print(f"\n PASSED") + return True + +# Sub-test 2: Same event across multiple devices + +def test_same_event_multiple_devices(papi, device_ids): + """Test monitoring the same event across all available devices""" + print(f"Sub-test 2: Same event across {len(device_ids)} devices {device_ids}") + eventset = papi.create_eventset() + if eventset is None: + return False + + 
event_names = [] + for d in device_ids: + event_name = f"gaudi2:::TPC_KERNEL_EXECUTED:device={d}" + print(f" Adding: {event_name}") + if not papi.add_named_event(eventset, event_name): + cleanup(papi, eventset) + return False + event_names.append(event_name) + + num_events = len(event_names) + print(f"\n Starting counters on {num_events} devices...") + if not papi.start(eventset): + cleanup(papi, eventset) + return False + + print(f" Running workload...") + run_pytorch_workload() + + values = papi.stop(eventset, num_events) + if values is None: + cleanup(papi, eventset) + return False + + print(f"\n Results (TPC_KERNEL_EXECUTED per device):") + print(f" {'Device':<10} {'Value':>15}") + print(f" {'-'*10} {'-'*15}") + for i, d in enumerate(device_ids): + print(f" device={d:<5} {values[i]:>15,}") + + cleanup(papi, eventset) + print(f"\n PASSED") + return True + +# Sub-test 3: Different events across different devices + +def test_mixed_events_devices(papi, device_ids): + """Test monitoring different events on different devices""" + print(f"Sub-test 3: Mixed events across devices {device_ids}") + eventset = papi.create_eventset() + if eventset is None: + return False + + d0 = device_ids[0] + events = [ + (f"gaudi2:::TPC_KERNEL_EXECUTED:device={d0}", f"TPC_KERNEL_EXECUTED:device={d0}"), + (f"gaudi2:::TPC_STALL:device={d0}", f"TPC_STALL:device={d0}"), + (f"gaudi2:::MME_NUM_OUTER_PRODUCTS:device={d0}", f"MME_NUM_OUTER_PRODUCTS:device={d0}"), + ] + + if len(device_ids) > 1: + d1 = device_ids[1] + events.extend([ + (f"gaudi2:::TPC_KERNEL_EXECUTED:device={d1}", f"TPC_KERNEL_EXECUTED:device={d1}"), + (f"gaudi2:::TPC_STALL:device={d1}", f"TPC_STALL:device={d1}"), + (f"gaudi2:::MME_NUM_OUTER_PRODUCTS:device={d1}", f"MME_NUM_OUTER_PRODUCTS:device={d1}"), + ]) + + num_events = 0 + added_labels = [] + for full_name, label in events: + print(f" Adding: {full_name}") + if papi.add_named_event(eventset, full_name): + num_events += 1 + added_labels.append(label) + else: + print(f" 
(skipped)") + + if num_events == 0: + print("\n ERROR: No events could be added") + papi.destroy_eventset(eventset) + return False + + print(f"\n Starting counters ({num_events} events)...") + if not papi.start(eventset): + cleanup(papi, eventset) + return False + + print(f" Running workload...") + run_pytorch_workload() + + values = papi.stop(eventset, num_events) + if values is None: + cleanup(papi, eventset) + return False + + print(f"\n Results:") + print(f" {'Event':<40} {'Value':>15}") + print(f" {'-'*40} {'-'*15}") + for label, value in zip(added_labels, values): + print(f" {label:<40} {value:>15,}") + + cleanup(papi, eventset) + print(f"\n PASSED") + return True + +# Sub-test 4: Invalid device qualifier (negative test) + +def test_invalid_device(papi, num_devices): + """Test that adding an event with an invalid device index fails gracefully""" + print(f"Sub-test 4: Invalid device qualifier (negative test)") + eventset = papi.create_eventset() + if eventset is None: + return False + + invalid_device = num_devices + event_name = f"gaudi2:::TPC_KERNEL_EXECUTED:device={invalid_device}" + print(f" Adding event with invalid device: {event_name}") + + ret = papi.papi.PAPI_add_named_event(eventset, event_name.encode()) + if ret != PAPI_OK: + print(f" Correctly rejected: {papi.strerror(ret)}") + papi.destroy_eventset(eventset) + print(f"\n PASSED") + return True + else: + print(f" ERROR: Should have rejected device={invalid_device} but accepted it") + cleanup(papi, eventset) + return False + +# Sub-test 5: Multiple reads during workload + +def test_read_during_workload(papi, device_id): + """Test reading counter values during workload execution""" + print(f"Sub-test 5: Multiple reads during workload on device={device_id}") + eventset = papi.create_eventset() + if eventset is None: + return False + + events = [ + f"gaudi2:::TPC_KERNEL_EXECUTED:device={device_id}", + f"gaudi2:::TPC_VECTOR_PIPE_EXEC:device={device_id}", + ] + num_events = 0 + for event_name in events: 
+ print(f" Adding: {event_name}") + if papi.add_named_event(eventset, event_name): + num_events += 1 + + if num_events == 0: + papi.destroy_eventset(eventset) + return False + + print(f"\n Starting counters...") + if not papi.start(eventset): + cleanup(papi, eventset) + return False + + # Read before workload + values_before = papi.read(eventset, num_events) + print(f" Before workload: {values_before}") + + # Run workload and read + print(f" Running workload...") + run_pytorch_workload() + + values_after = papi.read(eventset, num_events) + print(f" After workload: {values_after}") + + # Run another workload and stop + print(f" Running second workload...") + run_pytorch_workload() + + values_final = papi.stop(eventset, num_events) + print(f" Final (stop): {values_final}") + + if values_before and values_after and values_final: + print(f"\n Counter progression:") + print(f" {'Event':<30} {'Before':>12} {'After':>12} {'Final':>12}") + print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*12}") + labels = [e.split(":::")[-1] for e in events[:num_events]] + for i, label in enumerate(labels): + print(f" {label:<30} {values_before[i]:>12,} {values_after[i]:>12,} {values_final[i]:>12,}") + + cleanup(papi, eventset) + print(f"\n PASSED") + return True + +# Main + +def main(): + # Initialize PyTorch FIRST (acquires device fd needed by PAPI) + print("\n[SETUP] Initializing PyTorch HPU...") + if not init_pytorch(): + print(" SKIP: PyTorch HPU not available") + return 0 + + # Load and initialize PAPI + print("\n[SETUP] Loading PAPI library...") + papi_lib = load_papi() + if not papi_lib: + print("\nFAILED") + return 1 + + papi = PAPIWrapper(papi_lib) + + print("\n[SETUP] Initializing PAPI...") + if not papi.library_init(): + print("\nFAILED") + return 1 + print(" PAPI initialized successfully") + + # Detect devices by probing PAPI (not all /dev/accel/* are Gaudi2) + num_devices, device_ids = detect_devices_by_probing(papi) + print(f"\n Detected {num_devices} Gaudi2 device(s) usable by 
PAPI") + print(f" PAPI device indices: {device_ids}") + if num_devices == 0: + print(" SKIP: No Gaudi2 devices found via PAPI") + papi.shutdown() + return 0 + + # Run sub-tests + results = {} + + results["Sub-test 1: Single device qualifier"] = test_single_device_qualifier(papi, 0) + + if num_devices > 1: + test_devs = device_ids[:4] # Cap at 4 devices + results["Sub-test 2: Same event multi-device"] = test_same_event_multiple_devices( + papi, test_devs + ) + else: + print(f"\n Skipping Sub-test 2 (need >1 device, have {num_devices})") + results["Sub-test 2: Same event multi-device"] = None + + results["Sub-test 3: Mixed events/devices"] = test_mixed_events_devices(papi, device_ids) + + results["Sub-test 4: Invalid device"] = test_invalid_device(papi, num_devices) + + results["Sub-test 5: Read during workload"] = test_read_during_workload(papi, 0) + + # Summary + passed = 0 + failed = 0 + skipped = 0 + for name, result in results.items(): + if result is None: + status = "SKIPPED" + skipped += 1 + elif result: + status = "PASSED" + passed += 1 + else: + status = "FAILED" + failed += 1 + print(f" {name:<45} {status}") + + print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped") + + papi.shutdown() + + if failed > 0: + print("\nFAILED") + return 1 + + print("\nPASSED") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/components/gaudi2/tests/python/test_start_stop_read.py b/src/components/gaudi2/tests/python/test_start_stop_read.py new file mode 100644 index 000000000..dac3d4397 --- /dev/null +++ b/src/components/gaudi2/tests/python/test_start_stop_read.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 +""" +test_start_stop_read.py + +Test script for PAPI Gaudi2 component full measurement lifecycle. +Exercises the complete PAPI workflow: init, create eventset, add events, +start, run workload, stop, read values, cleanup, and shutdown on a +single Gaudi2 device. 
+
+Prerequisites:
+ - PAPI built and installed with gaudi2 component
+ - libpapi.so in LD_LIBRARY_PATH
+ - PyTorch with Habana support
+
+Usage:
+ PT_HPU_LAZY_MODE=1 python3 test_start_stop_read.py
+"""
+
+import ctypes
+import sys
+import os
+
+# PAPI Constants
+
+# 0x07030000 encodes PAPI version 7.3.0 (major<<24 | minor<<16); it is handed
+# to PAPI_library_init, which echoes the version back on success (see
+# PAPIWrapper.library_init). Must match the installed libpapi.
+PAPI_VER_CURRENT = 0x07030000
+PAPI_NULL = -1
+PAPI_OK = 0
+
+# Load PAPI Library
+
+def load_papi():
+ """Load PAPI shared library"""
+ # Resolution is left to the dynamic loader (LD_LIBRARY_PATH); add absolute
+ # candidate paths to this list if needed.
+ lib_paths = [
+ "libpapi.so",
+ ]
+
+ for path in lib_paths:
+ try:
+ papi = ctypes.CDLL(path)
+ print(f" Loaded PAPI from: {path}")
+ return papi
+ except OSError:
+ continue
+
+ print(" ERROR: Could not load libpapi.so")
+ print(" Make sure PAPI is built with gaudi2 component and in LD_LIBRARY_PATH")
+ return None
+
+# PAPI Wrapper
+
+class PAPIWrapper:
+ """Wrapper for PAPI library functions using ctypes"""
+
+ def __init__(self, papi):
+ self.papi = papi
+ self._setup_functions()
+
+ def _setup_functions(self):
+ """Setup ctypes function signatures"""
+ # Declaring argtypes/restype up front stops ctypes from falling back to
+ # its default int conversions for pointer arguments.
+ self.papi.PAPI_library_init.argtypes = [ctypes.c_int]
+ self.papi.PAPI_library_init.restype = ctypes.c_int
+
+ self.papi.PAPI_create_eventset.argtypes = [ctypes.POINTER(ctypes.c_int)]
+ self.papi.PAPI_create_eventset.restype = ctypes.c_int
+
+ self.papi.PAPI_add_named_event.argtypes = [ctypes.c_int, ctypes.c_char_p]
+ self.papi.PAPI_add_named_event.restype = ctypes.c_int
+
+ self.papi.PAPI_start.argtypes = [ctypes.c_int]
+ self.papi.PAPI_start.restype = ctypes.c_int
+
+ self.papi.PAPI_stop.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_longlong)]
+ self.papi.PAPI_stop.restype = ctypes.c_int
+
+ self.papi.PAPI_read.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_longlong)]
+ self.papi.PAPI_read.restype = ctypes.c_int
+
+ self.papi.PAPI_reset.argtypes = [ctypes.c_int]
+ self.papi.PAPI_reset.restype = ctypes.c_int
+
+ self.papi.PAPI_cleanup_eventset.argtypes = [ctypes.c_int]
+ self.papi.PAPI_cleanup_eventset.restype = ctypes.c_int
+
+ self.papi.PAPI_destroy_eventset.argtypes = [ctypes.POINTER(ctypes.c_int)]
+
self.papi.PAPI_destroy_eventset.restype = ctypes.c_int
+
+ self.papi.PAPI_shutdown.argtypes = []
+ self.papi.PAPI_shutdown.restype = None
+
+ self.papi.PAPI_strerror.argtypes = [ctypes.c_int]
+ self.papi.PAPI_strerror.restype = ctypes.c_char_p
+
+ # --- Thin wrappers: each converts the PAPI return code into a
+ # Python-friendly value (bool/list/None) and prints a message on error ---
+
+ def library_init(self, version=PAPI_VER_CURRENT):
+ """Initialize PAPI library"""
+ ret = self.papi.PAPI_library_init(version)
+ # PAPI_library_init returns the version number on success; any other
+ # positive value is a header/library version mismatch, negative is an
+ # error code decodable via PAPI_strerror.
+ if ret != version:
+ if ret > 0:
+ print(f" PAPI version mismatch: got 0x{ret:08x}, expected 0x{version:08x}")
+ else:
+ print(f" PAPI init failed: {self.strerror(ret)}")
+ return False
+ return True
+
+ def create_eventset(self):
+ """Create a new event set; returns the eventset handle or None on error"""
+ eventset = ctypes.c_int(PAPI_NULL)
+ ret = self.papi.PAPI_create_eventset(ctypes.byref(eventset))
+ if ret != PAPI_OK:
+ print(f" Failed to create eventset: {self.strerror(ret)}")
+ return None
+ return eventset.value
+
+ def add_named_event(self, eventset, name):
+ """Add a named event to the event set"""
+ ret = self.papi.PAPI_add_named_event(eventset, name.encode())
+ if ret != PAPI_OK:
+ print(f" Failed to add event '{name}': {self.strerror(ret)}")
+ return False
+ return True
+
+ def start(self, eventset):
+ """Start counting"""
+ ret = self.papi.PAPI_start(eventset)
+ if ret != PAPI_OK:
+ print(f" Failed to start: {self.strerror(ret)}")
+ return False
+ return True
+
+ def stop(self, eventset, num_events):
+ """Stop counting and return values as a list, or None on error"""
+ values = (ctypes.c_longlong * num_events)()
+ ret = self.papi.PAPI_stop(eventset, values)
+ if ret != PAPI_OK:
+ print(f" Failed to stop: {self.strerror(ret)}")
+ return None
+ return list(values)
+
+ def read(self, eventset, num_events):
+ """Read current counter values without stopping; None on error"""
+ values = (ctypes.c_longlong * num_events)()
+ ret = self.papi.PAPI_read(eventset, values)
+ if ret != PAPI_OK:
+ print(f" Failed to read: {self.strerror(ret)}")
+ return None
+ return list(values)
+
+ def reset(self, eventset):
+ """Reset counters"""
+ ret = self.papi.PAPI_reset(eventset)
+ return ret == PAPI_OK
+
+
def cleanup_eventset(self, eventset):
+ """Cleanup event set"""
+ # NOTE(review): unlike the wrappers above, the PAPI return code is
+ # ignored here and in destroy_eventset — confirm best-effort teardown
+ # is intended, or check/report the code for consistency.
+ self.papi.PAPI_cleanup_eventset(eventset)
+
+ def destroy_eventset(self, eventset):
+ """Destroy event set"""
+ es = ctypes.c_int(eventset)
+ self.papi.PAPI_destroy_eventset(ctypes.byref(es))
+
+ def shutdown(self):
+ """Shutdown PAPI"""
+ self.papi.PAPI_shutdown()
+
+ def strerror(self, code):
+ """Get error string"""
+ msg = self.papi.PAPI_strerror(code)
+ return msg.decode() if msg else f"Unknown error {code}"
+
+# PyTorch Initialization and Workload
+
+# Module-level cache so the HPU device is acquired exactly once per process.
+_pytorch_initialized = False
+_torch = None
+_hthpu = None
+_device = None
+
+def init_pytorch():
+ """Initialize PyTorch and acquire the Gaudi2 device BEFORE PAPI init"""
+ global _pytorch_initialized, _torch, _hthpu, _device
+
+ if _pytorch_initialized:
+ return True
+
+ try:
+ # habana_frameworks is only present on Gaudi machines; an ImportError
+ # is treated as "no HPU available" by the caller, not a failure.
+ import torch
+ import habana_frameworks.torch.hpu as hthpu
+
+ _torch = torch
+ _hthpu = hthpu
+
+ print(" Initializing PyTorch HPU...")
+ _device = torch.device("hpu")
+
+ # Force device initialization by creating a small tensor
+ _ = torch.zeros(1, device=_device)
+ hthpu.synchronize()
+
+ _pytorch_initialized = True
+ print(" PyTorch HPU initialized successfully")
+ return True
+
+ except ImportError as e:
+ print(f" PyTorch/Habana not available: {e}")
+ return False
+ except Exception as e:
+ print(f" Failed to initialize PyTorch HPU: {e}")
+ return False
+
+def run_pytorch_workload():
+ """Run a PyTorch matmul workload on Gaudi2"""
+ global _torch, _hthpu, _device
+
+ if not _pytorch_initialized:
+ print(" ERROR: PyTorch not initialized")
+ return False
+
+ try:
+ dtype = _torch.float32
+ size = 1024
+ a = _torch.randn(size, size, dtype=dtype, device=_device)
+ b = _torch.randn(size, size, dtype=dtype, device=_device)
+
+ # Warm-up
+ # The matmul results are intentionally discarded; the loops exist only
+ # to generate device activity for the counters.
+ for _ in range(3):
+ c = _torch.matmul(a, b)
+ _hthpu.synchronize()
+
+ # Actual workload
+ for _ in range(10):
+ c = _torch.matmul(a, b)
+ _hthpu.synchronize()
+
+ return True
+
+ except Exception as e:
+ print(f" Workload failed: {e}")
+ return False + +# Main Test + +def main(): + # Step 0: Initialize PyTorch FIRST to acquire the device + print("[0] Initializing PyTorch HPU (must be done before PAPI)...") + if not init_pytorch(): + print(" SKIP: PyTorch HPU not available") + return 0 + + # Step 1: Load PAPI + print("\n[1] Loading PAPI library...") + papi_lib = load_papi() + if not papi_lib: + print("\nFAILED") + return 1 + + papi = PAPIWrapper(papi_lib) + + # Step 2: Initialize PAPI + print("\n[2] Initializing PAPI...") + if not papi.library_init(): + print("\nFAILED") + return 1 + print(" PAPI initialized successfully") + + # Step 3: Create event set + print("\n[3] Creating event set...") + eventset = papi.create_eventset() + if eventset is None: + papi.shutdown() + print("\nFAILED") + return 1 + print(f" Event set created: {eventset}") + + # Step 4: Add Gaudi2 events + print("\n[4] Adding Gaudi2 events...") + events = [ + "gaudi2:::TPC_KERNEL_EXECUTED:device=0", + "gaudi2:::TPC_STALL:device=0", + "gaudi2:::TPC_VECTOR_PIPE_EXEC:device=0", + "gaudi2:::TPC_ICACHE_HIT:device=0", + "gaudi2:::TPC_DCACHE_HIT:device=0", + ] + + num_events = 0 + for event in events: + if papi.add_named_event(eventset, event): + print(f" Added: {event}") + num_events += 1 + else: + print(f" Failed: {event}") + + if num_events == 0: + print("\n ERROR: No events added. 
Is the gaudi2 component built?") + papi.destroy_eventset(eventset) + papi.shutdown() + print("\nFAILED") + return 1 + + # Step 5: Start counting + print(f"\n[5] Starting counters ({num_events} events)...") + if not papi.start(eventset): + papi.cleanup_eventset(eventset) + papi.destroy_eventset(eventset) + papi.shutdown() + print("\nFAILED") + return 1 + print(" Counters started") + + # Step 6: Run workload + print("\n[6] Running PyTorch workload...") + if not run_pytorch_workload(): + print(" WARNING: Workload failed, counters may be zero") + + # Step 7: Stop and read + print("\n[7] Stopping counters and reading values...") + values = papi.stop(eventset, num_events) + if values is None: + papi.cleanup_eventset(eventset) + papi.destroy_eventset(eventset) + papi.shutdown() + print("\nFAILED") + return 1 + + print("\n Results:") + print(" " + "-" * 55) + all_valid = True + for i, (event, value) in enumerate(zip(events[:num_events], values)): + event_name = event.split(":::")[-1] + print(f" {event_name:<40}: {value:>12,}") + if value < 0: + print(f" WARNING: Negative counter value for {event_name}") + all_valid = False + print(" " + "-" * 55) + + if not all_valid: + print("\n ERROR: Some counter values are invalid (negative)") + + # Step 8: Cleanup + print("\n[8] Cleaning up...") + papi.cleanup_eventset(eventset) + papi.destroy_eventset(eventset) + papi.shutdown() + print(" Done") + + if all_valid: + print("PASSED") + else: + print("FAILED") + + return 0 if all_valid else 1 + +if __name__ == "__main__": + sys.exit(main())