Skip to content

Commit ab0fe21

Browse files
committed
Restructure Tile Engine's benchmarking process
This change restructures the Benchmark structs into 3 files. It adds a base class for all GEMM benchmarks and derived classes for Universal GEMM, multi dim GEMM, and GEMM preshuffle. Common functions have been relocated into a common directory. For any derived class, only the redefinition of the constructor is needed, significantly reducing the need for duplicated code. Restructure Tile Engine's profiling process: this change restructures the profiling process in Tile Engine into a base class for the Profiling and Problem structs. With this, all files needed for Tile Engine will have a base struct and files in the gemm/ directory that can be extended for each GEMM variant. Only the Problem and Profiler structs along with the reference functions need to be defined. Profiling functions that are common to each operation have been moved into a common utility file. Additional changes: add the README back into the gemm directory and integrate the new preshuffle functions; disable the gemm tile engine tests; update the preshuffle example to match the new tensor_shuffle_utils interface.
1 parent f0d4950 commit ab0fe21

32 files changed

+2260
-3671
lines changed

projects/composablekernel/Jenkinsfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ pipeline {
17071707
-D GEMM_PRESHUFFLE_LAYOUT="rcr" \
17081708
-D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
17091709
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
1710-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1710+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
17111711
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
17121712
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
17131713
}
@@ -1748,7 +1748,7 @@ pipeline {
17481748
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
17491749
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
17501750
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
1751-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1751+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
17521752
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
17531753
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
17541754
}
@@ -1802,7 +1802,7 @@ pipeline {
18021802
-D GEMM_UNIVERSAL_DATATYPE="fp16" \
18031803
-D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \
18041804
ninja -j${nthreads()} benchmark_gemm_universal_all && \
1805-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
1805+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
18061806
}
18071807
steps{
18081808
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)

projects/composablekernel/test/ck_tile/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ add_subdirectory(core)
6464
add_subdirectory(epilogue)
6565
add_subdirectory(atomic_add_op)
6666
add_subdirectory(fmha)
67-
add_subdirectory(gemm_tile_engine)
67+
# add_subdirectory(gemm_tile_engine)
6868
add_subdirectory(pooling)
6969
add_subdirectory(grouped_conv)
7070
add_subdirectory(gemm_streamk_tile_engine)

projects/composablekernel/test/ck_tile/gemm_tile_engine/CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# ============================================================================
1111

1212
# Locate tile_engine GEMM scripts directory
13-
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm")
13+
set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal")
1414

1515
if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR})
1616
message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}")
@@ -32,11 +32,11 @@ endif()
3232
# config_json - Full path to JSON configuration file
3333
# ============================================================================
3434
function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json)
35-
set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
35+
set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}")
3636
set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}")
3737

3838
# Generated header path (already created during cmake configuration)
39-
set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
39+
set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp")
4040
set(test_params_header "${working_path}/test_params.hpp")
4141

4242
# Verify header exists (should have been generated during cmake configuration)
@@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name)
118118

119119
# STEP 1: Discovery phase - list all valid kernel configurations
120120
execute_process(
121-
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
121+
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
122122
--working_path ${working_path}
123123
--datatype ${datatype}
124124
--layout ${layout}
@@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name)
178178

179179
# Generate header using --gen_single
180180
execute_process(
181-
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py
181+
COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py
182182
--working_path ${working_path}
183183
--gpu_target "${GEMM_TEST_GPU_TARGETS}"
184184
--datatype ${datatype}

projects/composablekernel/tile_engine/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
include_directories(BEFORE
55
${CMAKE_CURRENT_LIST_DIR}/include
6+
${CMAKE_CURRENT_LIST_DIR}/ops
67
)
78

89
add_subdirectory(ops/gemm)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
# SPDX-License-Identifier: MIT
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
3+
# SPDX-License-Identifier: MIT
4+
5+
import json
6+
import subprocess
7+
import csv
8+
from pathlib import Path
9+
from typing import List, Dict, Optional
10+
11+
12+
def run_kernel(
    build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False
) -> Optional[Dict]:
    """Execute one kernel binary and return its parsed JSON report.

    The executable at ``kernel_path`` is launched with one ``-key=value``
    argument per entry of ``params`` plus ``-json_output=true``. Its stdout is
    stored as ``<build_dir>/results/<kernel stem>.json`` and that file is then
    handed to ``parse_json_file``.

    Args:
        build_dir: Directory under which the ``results`` folder is created.
        kernel_path: Path of the executable to run.
        params: Command-line options forwarded to the executable.
        verbose: When true, echo the command line; also forwarded to the parser.

    Returns:
        The parsed result dictionary, or ``None`` when the process exits with a
        non-zero code, prints nothing, exceeds the 60 second timeout, or any
        other exception occurs while running it.
    """
    out_dir = build_dir / "results"
    out_dir.mkdir(exist_ok=True)

    # One report file per kernel, named after the executable.
    json_file = out_dir / f"{kernel_path.stem}.json"

    cmd = [
        str(kernel_path),
        *(f"-{key}={value}" for key, value in params.items()),
        # Ask the executable for clean JSON on stdout.
        "-json_output=true",
    ]

    if verbose:
        print(f"Running: {' '.join(cmd)}")

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

        if proc.returncode != 0:
            print(f"Error running {kernel_path.name}: {proc.stderr}")
            return None

        payload = proc.stdout.strip()
        if not payload:
            print(f"No output from {kernel_path.name}")
            return None

        # Keep the raw report on disk, then parse it back from that file.
        with open(json_file, "w") as f:
            f.write(payload)
        return parse_json_file(json_file, verbose=verbose)

    except subprocess.TimeoutExpired:
        print(f"Timeout running {kernel_path.name}")
        return None
    except Exception as e:
        print(f"Error running {kernel_path.name}: {e}")
        return None
60+
61+
62+
def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]:
    """Load a kernel's JSON report and attach flat performance shortcuts.

    The decoded document is shallow-copied and returned. When it has a
    ``perf_result`` entry, three convenience keys are added to the copy:
    ``time_ms`` (from ``latency(ms)``), ``tflops`` (from ``tflops(TFlops)``)
    and ``bandwidth_gb_s`` (from ``bandwidth(GB/s)``), each defaulting to 0.

    Args:
        json_file: Path of the report to read.
        verbose: When true, print a message describing any failure.

    Returns:
        The enriched copy of the report, or ``None`` if the file cannot be
        read, is not valid JSON, or cannot be processed.
    """
    try:
        with open(json_file, "r") as f:
            raw = f.read().strip()

        # Executables are expected to emit clean JSON, so decode it directly.
        parsed = json.loads(raw)

        enriched = parsed.copy()
        if "perf_result" in parsed:
            perf = parsed["perf_result"]
            # Flat aliases kept for backward compatibility.
            enriched["time_ms"] = perf.get("latency(ms)", 0)
            enriched["tflops"] = perf.get("tflops(TFlops)", 0)
            enriched["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0)
        return enriched

    except json.JSONDecodeError as e:
        if verbose:
            print(f"Failed to parse JSON from {json_file}: {e}")
    except Exception as e:
        if verbose:
            print(f"Error reading JSON file {json_file}: {e}")
    return None
90+
91+
92+
def find_best_kernel(results: List[Dict], metric: str = "tflops") -> Optional[Dict]:
    """Pick the top result according to ``metric``.

    ``tflops`` and ``bandwidth_gb_s`` are maximised (a missing value counts as
    0); ``time_ms`` is minimised (a missing value counts as infinity).

    Args:
        results: Result dictionaries to choose from.
        metric: One of ``"tflops"``, ``"time_ms"`` or ``"bandwidth_gb_s"``.

    Returns:
        The winning result dictionary, or ``None`` when ``results`` is empty.

    Raises:
        ValueError: If ``results`` is non-empty and ``metric`` is not one of
            the supported names.
    """
    if not results:
        return None

    # Lower is better only for latency.
    if metric == "time_ms":
        return min(results, key=lambda entry: entry.get("time_ms", float("inf")))

    # Higher is better for the throughput-style metrics.
    if metric in ("tflops", "bandwidth_gb_s"):
        return max(results, key=lambda entry: entry.get(metric, 0))

    raise ValueError(f"Unknown metric: {metric}")
105+
106+
107+
def export_csv(results: List[Dict], filename: str, verbose: bool = False):
    """Write every result as one CSV row.

    The header is the alphabetically sorted union of all keys found across
    ``results``. If ``results`` is empty a notice is printed and no file is
    written.

    Args:
        results: Result dictionaries to export.
        filename: Destination path of the CSV file.
        verbose: Accepted for signature parity; not used by this function.
    """
    if not results:
        print("No results to export")
        return

    # Union of keys across all rows, sorted so the column order is stable.
    columns = sorted({key for row in results for key in row.keys()})

    with open(filename, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for row in results:
            writer.writerow(row)

    print(f"Results exported to {filename}")
127+
128+
129+
def export_best_kernels(best_kernels: Dict, filename: str, verbose: bool = False):
    """Write a plain-text summary of the best kernel per problem.

    After a two-line comment header, one line is emitted per entry of
    ``best_kernels`` (in sorted key order) showing the kernel's ``name``,
    ``tflops``, ``bandwidth_gb_s`` and ``time_ms`` values.

    Args:
        best_kernels: Mapping from a problem key to a kernel result dictionary
            that provides the four fields listed above.
        filename: Destination path of the text file.
        verbose: Accepted for signature parity; not used by this function.
    """
    header = (
        "# Best kernel selections\n"
        "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n"
    )

    with open(filename, "w") as out:
        out.write(header)

        for problem, best in sorted(best_kernels.items()):
            out.write(
                f"{problem}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)\n"
            )

    print(f"Best kernels exported to {filename}")
143+
144+
145+
def _tflops_breakdown(successful_results: List[Dict], field_name: str) -> Dict:
    """Group results by ``config[field_name]`` and summarise TFLOPS per group.

    Results without a ``config`` entry, or whose config lacks ``field_name``,
    are collected under the ``"unknown"`` group.
    """
    tflops_by_group: Dict = {}
    for result in successful_results:
        group = result.get("config", {}).get(field_name, "unknown")
        tflops_by_group.setdefault(group, []).append(result.get("tflops", 0))

    return {
        group: {
            "count": len(values),
            "avg_tflops": sum(values) / len(values),
            "best_tflops": max(values),
        }
        for group, values in tflops_by_group.items()
    }


def _mean_or_zero(values: List) -> float:
    """Arithmetic mean of ``values``, or 0 for an empty list."""
    return sum(values) / len(values) if values else 0


def export_json(
    results: List[Dict],
    filename: str,
    best_kernels: Optional[Dict] = None,
    verbose: bool = False,
):
    """Write all results, best kernels and summary statistics to a JSON file.

    A result counts as successful when its ``tflops`` value is greater than 0.
    Summary statistics (best / average / min or max / median) are computed
    over the successful results for ``tflops``, ``bandwidth_gb_s`` and
    ``time_ms``; latency statistics additionally ignore non-positive
    ``time_ms`` values. A per-pipeline, per-scheduler and per-data-type TFLOPS
    breakdown is derived from each result's ``config`` dictionary.

    Args:
        results: Result dictionaries to export; written verbatim under
            ``kernel_results``.
        filename: Destination path of the JSON file.
        best_kernels: Optional mapping of problem key to best kernel result;
            written under ``best_kernels_by_problem`` (``{}`` when omitted).
        verbose: Accepted for signature parity; not used by this function.
    """
    from datetime import datetime
    import statistics

    def median_or_zero(values: List) -> float:
        # statistics.median averages the two middle values for even-length
        # input; plain indexing at len // 2 would report the upper one instead.
        return statistics.median(values) if values else 0

    successful_results = [r for r in results if r.get("tflops", 0) > 0]

    tflops_values = [r.get("tflops", 0) for r in successful_results]
    bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results]
    latency_values = [
        r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0
    ]

    output_data = {
        "benchmark_metadata": {
            "timestamp": datetime.now().isoformat(),
            "total_kernels_tested": len(results),
            "unique_kernels": len(set(r.get("name", "unknown") for r in results)),
            "successful_runs": len(successful_results),
            "failed_runs": len(results) - len(successful_results),
        },
        "performance_summary": {
            "tflops_stats": {
                "best": max(tflops_values, default=0),
                "average": _mean_or_zero(tflops_values),
                "min": min(tflops_values, default=0),
                "median": median_or_zero(tflops_values),
            },
            "bandwidth_stats": {
                "best_gb_s": max(bandwidth_values, default=0),
                "average_gb_s": _mean_or_zero(bandwidth_values),
                "min_gb_s": min(bandwidth_values, default=0),
                "median_gb_s": median_or_zero(bandwidth_values),
            },
            "latency_stats": {
                # Lower latency is better, so "best" is the minimum.
                "best_ms": min(latency_values, default=0),
                "average_ms": _mean_or_zero(latency_values),
                "max_ms": max(latency_values, default=0),
                "median_ms": median_or_zero(latency_values),
            },
            "kernel_type_breakdown": {
                "by_pipeline": _tflops_breakdown(successful_results, "pipeline"),
                "by_scheduler": _tflops_breakdown(successful_results, "scheduler"),
                "by_data_type": _tflops_breakdown(successful_results, "data_type"),
            },
            "total_problem_configurations": len(best_kernels) if best_kernels else 0,
        },
        "kernel_results": results,
        "best_kernels_by_problem": best_kernels or {},
    }

    with open(filename, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"JSON results exported to {filename}")
    print(f" - Total kernels: {len(results)}")
    print(f" - Successful runs: {len(successful_results)}")
    print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}")
    print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s")
    print(f" - Best latency: {min(latency_values, default=0):.2f}ms")

0 commit comments

Comments
 (0)