Skip to content

Commit 9e89b4d

Browse files
authored
GTPin v3 gpu_inst_count tool update
* Updated GTPin download and unpack script * Rewritten instruction count tool with GTPin v3 * Tests were enabled. Tests were not changed for this tool, Passrate 3/3
1 parent bd1a411 commit 9e89b4d

29 files changed

+3089
-292
lines changed

.github/workflows/sdk_build_and_test.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,8 @@ jobs:
9393
if: always()
9494
run: |
9595
python ./tests/run.py -s cl_gpu_metrics
96+
97+
- name: Build-and-test-gpu-inst-count
98+
if: always()
99+
run: |
100+
python ./tests/run.py -s gpu_inst_count

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.49.16
1+
0.49.17

build_utils/CMakeLists.txt

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -423,9 +423,7 @@ macro(FindGTPinLibrary_legacy TARGET)
423423
DEPENDS ${GTPIN_LIB_PATH}/GTPIN/libgcc_s.so.1
424424
${GTPIN_LIB_PATH}/GTPIN/libged.so
425425
${GTPIN_LIB_PATH}/GTPIN/libgtpin.so
426-
${GTPIN_LIB_PATH}/GTPIN/libgtpin_core.so
427-
${GTPIN_LIB_PATH}/GTPIN/libiga_wrapper.so
428-
${GTPIN_LIB_PATH}/GTPIN/libstdc++.so.6)
426+
${GTPIN_LIB_PATH}/GTPIN/libgtpin_core.so)
429427
add_custom_command(OUTPUT ${GTPIN_LIB_PATH}/GTPIN/libgcc_s.so.1
430428
${GTPIN_LIB_PATH}/GTPIN/libged.so
431429
${GTPIN_LIB_PATH}/GTPIN/libgtpin.so
@@ -542,13 +540,27 @@ macro(GetGTPinPackage TARGET)
542540
endif()
543541
endif()
544542
include(FetchContent)
545-
FetchContent_Declare(
546-
gtpin_package
547-
URL ${GTPIN_LINK}
548-
)
543+
if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
544+
FetchContent_Declare(
545+
gtpin_package
546+
URL ${GTPIN_LINK}
547+
)
548+
else()
549+
FetchContent_Declare(
550+
gtpin_package
551+
URL ${GTPIN_LINK}
552+
DOWNLOAD_EXTRACT_TIMESTAMP YES
553+
)
554+
endif()
549555
message(STATUS "Download GTPin package link: ${GTPIN_LINK}")
550556
FetchContent_MakeAvailable(gtpin_package)
551-
set(GTPIN_PATH ${gtpin_package_SOURCE_DIR})
557+
if(UNIX)
558+
set(GTPIN_PATH ${gtpin_package_SOURCE_DIR})
559+
elseif(WIN32)
560+
set(GTPIN_PATH ${gtpin_package_SOURCE_DIR}/Profilers)
561+
else()
562+
message(FATAL_ERROR "Graphics Technology Pin (GT Pin) is not supported for the platform")
563+
endif()
552564
message(STATUS "GTPin Technology Pin (GT Pin) unpacked: ${GTPIN_PATH}")
553565
else()
554566
if(NOT IS_ABSOLUTE ${GTPIN_PATH})

build_utils/get_gtpin_headers_legacy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,4 @@ def main():
7575
"gt_basic_utils.h"])
7676

7777
if __name__ == "__main__":
78-
main()
78+
main()

build_utils/get_gtpin_libs_legacy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,4 @@ def main():
5151
build_utils.copy(src_path, build_path, gtpin_dlls)
5252

5353
if __name__ == "__main__":
54-
main()
54+
main()

samples/gpu_inst_count/CMakeLists.txt

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,34 @@ SetCompilerFlags()
77
SetBuildType()
88

99
# Tool Library
10+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
1011

11-
add_library(gput_inst_count SHARED "${PROJECT_SOURCE_DIR}/../../loader/init.cc" tool.cc)
12+
add_library(gput_inst_count SHARED
13+
"${PROJECT_SOURCE_DIR}/tool.cc"
14+
"${PROJECT_SOURCE_DIR}/../../loader/init.cc"
15+
"${PROJECT_SOURCE_DIR}/gpu_inst_count.cc"
16+
)
1217
target_include_directories(gput_inst_count
1318
PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
1419
if(CMAKE_INCLUDE_PATH)
1520
target_include_directories(gput_inst_count
1621
PUBLIC "${CMAKE_INCLUDE_PATH}")
1722
endif()
1823

19-
FindIGALibrary(gput_inst_count)
20-
GetIGAHeaders(gput_inst_count)
24+
add_subdirectory(gtpin_dev_kit)
25+
FindGTPinDevKitHeaders(gput_inst_count)
26+
target_link_libraries(gput_inst_count gtpin_dev_kit)
2127

22-
FindGTPinLibrary_legacy(gput_inst_count)
23-
GetGTPinHeaders_legacy(gput_inst_count)
28+
FindGTPinLibrary(gput_inst_count)
29+
FindGTPinHeaders(gput_inst_count)
30+
FindGTPinUtils(gput_inst_count)
2431

2532
# Loader
26-
2733
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOOL_NAME=gput_inst_count")
2834
add_executable(gpu_inst_count "${PROJECT_SOURCE_DIR}/../../loader/loader.cc")
2935
target_include_directories(gpu_inst_count
3036
PRIVATE "${PROJECT_SOURCE_DIR}/../../utils")
3137
if(UNIX)
3238
target_link_libraries(gpu_inst_count
3339
dl)
34-
endif()
40+
endif()

samples/gpu_inst_count/README.md

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# GPU Instruction Count
2+
23
## Overview
4+
35
This sample is a simple LD_PRELOAD based tool that collects the dynamic execution count for every OpenCL(TM) kernel instruction.
46

57
As a result, assembly listing annotated with dynamic instruction count for each kernel will be printed.
6-
```
8+
9+
```console
710
=== GEMM (runs 4 times) ===
811
[ 32768] 0x0000: (W) mov (8|M0) r5.0<1>:ud r0.0<1;1,0>:ud
912
[ 32768] 0x0010: (W) or (1|M0) cr0.0<1>:ud cr0.0<0;1,0>:ud 0x4C0:uw {Switch}
@@ -72,62 +75,88 @@ As a result, assembly listing annotated with dynamic instruction count for each
7275
[ 32768] 0x0398: illegal
7376
[ 32768] 0x03A8: illegal
7477
```
78+
7579
## Supported OS
80+
7681
- Linux
7782
- Windows (*under development*)
7883

7984
## Prerequisites
85+
8086
- [CMake](https://cmake.org/) (version 3.12 and above)
8187
- [Git](https://git-scm.com/) (version 1.8 and above)
8288
- [Python](https://www.python.org/) (version 2.7 and above)
8389
- [Graphics Technology Pin (GT Pin)](https://software.intel.com/content/www/us/en/develop/articles/gtpin.html)
8490

8591
## Build and Run
92+
8693
### Linux
94+
8795
Run the following commands to build the sample:
96+
8897
```sh
8998
cd <pti>/samples/gpu_inst_count
9099
mkdir build
91100
cd build
92101
cmake -DCMAKE_BUILD_TYPE=Release [-DGTPIN_PATH=<gtpin>/Profilers] ..
93102
make
94103
```
104+
95105
Use this command line to run the tool:
106+
96107
```sh
97108
./gpu_inst_count <target_application>
98109
```
110+
99111
One may use [cl_gemm](../cl_gemm), [ze_gemm](../ze_gemm) or [dpc_gemm](../dpc_gemm) as target application:
112+
100113
```sh
101114
./gpu_inst_count ../../cl_gemm/build/cl_gemm
102115
./gpu_inst_count ../../ze_gemm/build/ze_gemm
103116
./gpu_inst_count ../../dpc_gemm/build/dpc_gemm
104117
```
118+
105119
### Windows
120+
106121
Use Microsoft* Visual Studio x64 command prompt to run the following commands and build the sample:
122+
107123
```sh
108124
cd <pti>\samples\gpu_inst_count
109125
mkdir build
110126
cd build
111-
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DGTPIN_PATH=<gtpin>\Profilers -DCMAKE_LIBRARY_PATH=<iga_lib_path> ..
127+
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release ..
112128
nmake
113129
```
130+
114131
Use this command line to run the tool:
132+
115133
```sh
116134
set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
117135
gpu_inst_count.exe <target_application>
118136
```
137+
138+
CMake unpacks GTPin into "_deps\gtpin_package-src\Profilers":
139+
140+
```sh
141+
set PATH=%PATH%;_deps\gtpin_package-src\Profilers\Lib\intel64
142+
gpu_inst_count.exe <target_application>
143+
```
144+
119145
One may use [cl_gemm](../cl_gemm), [ze_gemm](../ze_gemm) or [dpc_gemm](../dpc_gemm) as target application:
146+
120147
```sh
121148
set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
122149
gpu_inst_count.exe ..\..\cl_gemm\build\cl_gemm.exe
123150
gpu_inst_count.exe ..\..\ze_gemm\build\ze_gemm.exe
124151
gpu_inst_count.exe ..\..\dpc_gemm\build\dpc_gemm.exe
125152
```
126-
**Note**: to build this sample one may need to generate *.lib file from IGA *.dll (see [here](https://stackoverflow.com/questions/9946322/how-to-generate-an-import-library-lib-file-from-a-dll) for details) and provide the path to this *.lib to cmake with `-DCMAKE_LIBRARY_PATH`.
153+
154+
**Note**: to build this sample one may need to generate \*.lib file from IGA \*.dll (see [here](https://stackoverflow.com/questions/9946322/how-to-generate-an-import-library-lib-file-from-a-dll) for details) and provide the path to this \*.lib to cmake with `-DCMAKE_LIBRARY_PATH`.
127155

128156
Also one may need to add an actual path to IGA *.dll into PATH before sample run, e.g.:
129-
```
157+
158+
```sh
130159
set PATH=%PATH%;<gtpin>\Profilers\Lib\intel64
131160
set PATH=%PATH%;<iga_dll_path>
132161
gpu_inst_count.exe ..\..\cl_gemm\build\cl_gemm.exe
133-
```
162+
```
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
2+
#include "gpu_inst_count.hpp"
3+
4+
using namespace gtpin_prof;
5+
using namespace gtpin;
6+
7+
/********************
8+
 * Required functions - should be implemented
9+
*/
10+
11+
/// Adds the counter carried by one raw profiling record to a result entry.
///
/// @param profilingResult  Result entry to update; it is down-cast to
///                         GpuInstCountResultData.
/// @param record           Raw record; it is reinterpreted as GpuInstCountRecord.
/// @return PROF_STATUS_SUCCESS once the count has been added.
///
/// The value is added to the running total rather than assigned, so calling
/// this repeatedly for the same result entry sums the records.
PROF_STATUS GpuInstCountKernel::Accumulate(std::shared_ptr<ResultData> profilingResult,
                                           GTPinProfileRecord* record) {
  PTI_ASSERT((record != nullptr) && "Profiling record is missing.");
  auto gpuInstCountRec = reinterpret_cast<GpuInstCountRecord*>(record);

  auto gpuInstCountResult = std::dynamic_pointer_cast<GpuInstCountResultData>(profilingResult);
  // dynamic_pointer_cast yields nullptr when the dynamic type does not match;
  // stop here instead of dereferencing a null pointer below.
  PTI_ASSERT((gpuInstCountResult != nullptr) &&
             "Result data was wrongly initialized. Check factory.");

  gpuInstCountResult->count += gpuInstCountRec->count;

  return PROF_STATUS_SUCCESS;
}
23+
24+
/// Collects the instrumentation points of a kernel and sizes the profile buffer.
///
/// Every instruction of every basic block in the kernel's control-flow graph
/// is recorded in bblData, keyed by the instruction offset and mapped to the
/// first instruction of its basic block. The record size is set to
/// sizeof(GpuInstCountRecord) and the number of records to the number of
/// entries collected.
///
/// @param instrumentor  GTPin instrumentation interface for the kernel.
/// @return PROF_STATUS_SUCCESS.
PROF_STATUS GpuInstCountKernel::AnalyzeKernel(IGtKernelInstrument& instrumentor) {
  const IGtCfg& cfg = instrumentor.Cfg();

  SetRecordSize(sizeof(GpuInstCountRecord));
  SetDefautBuckets(instrumentor);

  for (auto bblPtr : cfg.Bbls()) {
    for (auto insPtr : bblPtr->Instructions()) {
      const InstructionOffset offset = cfg.GetInstructionOffset(*insPtr);

      // Each instruction is keyed by its own offset but mapped to the first
      // instruction of its block.
      bblData.emplace(offset, bblPtr->FirstIns());
    }
  }

  // One profiling record per collected instruction.
  SetRecordsNum(bblData.size());

  return PROF_STATUS_SUCCESS;
}
46+
47+
/// Inserts an instruction counter for every entry collected in bblData.
///
/// For each entry a fresh procedure is generated through PointOfInterest,
/// targeting the `count` field of the record whose index equals the entry's
/// position in bblData iteration order, and is inserted before the
/// instruction stored in that entry.
///
/// @param instrumentor  GTPin instrumentation interface for the kernel.
/// @return PROF_STATUS_SUCCESS.
PROF_STATUS GpuInstCountKernel::Instrument(IGtKernelInstrument& instrumentor) {
  size_t bblIdx = 0;
  for (auto it = bblData.begin(); it != bblData.end(); ++it, ++bblIdx) {
    // A new procedure per point: each one is closed and attached independently.
    GtGenProcedure proc;
    PointOfInterest poi(instrumentor, m_profileArray, bblIdx);
    poi.InstructionCounterAnalysis(offsetof(GpuInstCountRecord, count));
    poi.ClosePOI(proc);
    instrumentor.InstrumentInstruction(it->second, GtIpoint::Before(), proc);
  }

  return PROF_STATUS_SUCCESS;
}
70+
71+
/********************
72+
 * Optional functions - may be changed or not, based on tool behaviour
73+
*/
74+
75+
/// Creates one result entry per bblData entry for a kernel invocation.
///
/// Results are appended to the invocation data in bblData iteration order,
/// the same order Instrument() uses to number the profiling records. Each
/// result is tagged with the instruction offset of its entry.
///
/// @param invocationData  Invocation container; down-cast to
///                        GpuInstCountInvocationData.
/// @param dispatcher      Unused here.
/// @param execDescr       Unused here.
/// @param factory         Factory used to create each result entry.
/// @return PROF_STATUS_SUCCESS.
PROF_STATUS GpuInstCountKernel::InitResultData(std::shared_ptr<InvocationData> invocationData,
                                               IGtKernelDispatch& dispatcher,
                                               const GTPinKernelExecDesriptor& execDescr,
                                               const std::shared_ptr<IToolFactory> factory) {
  auto invData = std::dynamic_pointer_cast<GpuInstCountInvocationData>(invocationData);
  PTI_ASSERT((invData != nullptr) && "Invocation data was wrongly initialized. Check factory.");

  for (const auto& entry : bblData) {
    auto gpuInstCountResult =
        std::dynamic_pointer_cast<GpuInstCountResultData>(factory->MakeResultData());
    // The factory returns the base type; verify the down-cast before use.
    PTI_ASSERT((gpuInstCountResult != nullptr) &&
               "Result data was wrongly initialized. Check factory.");

    gpuInstCountResult->instructionOffset = entry.first;
    invData->data.push_back(gpuInstCountResult);
  }

  return PROF_STATUS_SUCCESS;
}
92+
93+
/// Post-processing hook for collected invocation data.
///
/// This tool performs no post-processing: the argument is left untouched and
/// success is reported unconditionally.
///
/// @param invocationData  Invocation data (unused).
/// @return PROF_STATUS_SUCCESS.
PROF_STATUS GpuInstCountKernel::PostProcData(std::shared_ptr<InvocationData> invocationData) {
  // Nothing to do for this tool.
  return PROF_STATUS_SUCCESS;
}
96+
97+
/**
98+
* GpuInstCount implementations
99+
*/
100+
/// Returns the GTPin knobs this tool uses.
///
/// @return A vector holding the single knob "--no_empty_profile_dir".
std::vector<const char*> GpuInstCount::SetGtpinKnobs() const {
  std::vector<const char*> knobs;
  knobs.push_back("--no_empty_profile_dir");
  return knobs;
}
103+
104+
/**
105+
* GpuInstCountFactory implementations
106+
*/
107+
/// Creates the kernel object of this tool.
///
/// @param instrumentor  GTPin instrumentation interface, forwarded to the
///                      GpuInstCountKernel constructor.
/// @param kernelData    Kernel data, forwarded to the constructor.
/// @return A new GpuInstCountKernel, returned through the base-class pointer.
std::shared_ptr<GTPinProfileKernel> GpuInstCountFactory::MakeKernel(
    IGtKernelInstrument& instrumentor, std::shared_ptr<KernelData> kernelData) {
  auto kernel = std::make_shared<GpuInstCountKernel>(instrumentor, kernelData);
  return kernel;
}
111+
112+
/// Allocates a value-initialized GpuInstCountRecord on the heap.
///
/// @return Raw pointer to the new record.
/// NOTE(review): the pointer is owning and nothing in this file frees it;
/// presumably the caller is responsible for deleting it - confirm.
GTPinProfileRecord* GpuInstCountFactory::MakeRecord() {
  return new GpuInstCountRecord();
}
116+
117+
std::shared_ptr<ProfilerData> GpuInstCountFactory::MakeProfilerData() {
118+
return std::make_shared<GpuInstCountProfilerData>();
119+
};
120+
121+
/// Creates the per-kernel data container of this tool.
///
/// @param instrumentor  GTPin instrumentation interface, forwarded to the
///                      GpuInstCountKernelData constructor.
/// @return A new GpuInstCountKernelData, returned through the base-class pointer.
std::shared_ptr<KernelData> GpuInstCountFactory::MakeKernelData(IGtKernelInstrument& instrumentor) {
  auto kernelData = std::make_shared<GpuInstCountKernelData>(instrumentor);
  return kernelData;
}
124+
125+
std::shared_ptr<InvocationData> GpuInstCountFactory::MakeInvocationData(
126+
const GTPinKernelExecDesriptor& execDescr) {
127+
return std::make_shared<GpuInstCountInvocationData>(execDescr);
128+
};
129+
130+
std::shared_ptr<ResultData> GpuInstCountFactory::MakeResultData() {
131+
return std::make_shared<GpuInstCountResultData>();
132+
};

0 commit comments

Comments
 (0)