Skip to content

Commit ae0cce2

Browse files
JRPan and claude authored
Add GPGPU-Sim support and migrate parboil driver to Python 3 (#76)
- Add GPGPU-Sim detection in gpuConfig.h by checking for a gpgpusim.config file
- Parse the L2 cache configuration (nsets, linesize, assoc) from gpgpusim.config
- Calculate L2_SIZE, FBP_COUNT, and L2_BANKS from the simulator config when running in GPGPU-Sim
- Update the parboil driver from Python 2 to Python 3:
  - Convert print statements to print() functions
  - Use relative imports (from . import module)
  - Replace itertools.imap with map, itertools.ifilter with filter
  - Update exception syntax (raise Type, msg -> raise Type(msg))
  - Replace .iteritems() and .itervalues() with .items() and .values()
  - Replace file() with open()
  - Fix indentation (tabs to spaces)
  - Use '==' instead of 'is' for string comparisons in process.py

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4becbe3 commit ae0cce2

File tree

8 files changed

+209
-170
lines changed

8 files changed

+209
-170
lines changed

src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h

Lines changed: 81 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55
#include <cstdlib>
66
#include <cstdint>
77
#include <iostream>
8+
#include <fstream>
9+
#include <sstream>
810
#include <cuda_runtime.h>
911
#include <fcntl.h>
1012
#include <unistd.h>
@@ -237,46 +239,85 @@ inline unsigned queryGrInfo(uint32_t info_index)
237239

238240
unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
239241
{
240-
cudaSetDevice(deviceID);
241-
cudaGetDeviceProperties(&deviceProp, deviceID);
242-
243-
int clockRateKHz;
244-
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
245-
246-
// core stats
247-
248-
config.SM_NUMBER = deviceProp.multiProcessorCount;
249-
config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
250-
config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
251-
config.WARP_SIZE = deviceProp.warpSize;
252-
config.MAX_WARPS_PER_SM =
253-
deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
254-
config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;
255-
256-
// threadblock stats
257-
config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
258-
config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
259-
config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;
260-
261-
// launched thread blocks to ensure GPU is fully occupied as much as possible
262-
config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
263-
config.BLOCKS_PER_SM =
264-
deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
265-
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
266-
config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
267-
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
268-
269-
// L2 cache
270-
config.L2_SIZE = deviceProp.l2CacheSize;
271-
272-
// memory
273-
config.MEM_SIZE = deviceProp.totalGlobalMem;
274-
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
275-
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
276-
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
277-
278-
config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
279-
config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
242+
// Check if running in GPGPU-Sim by looking for gpgpusim.config
243+
std::ifstream configFile("gpgpusim.config");
244+
bool isGpgpuSim = configFile.is_open();
245+
246+
if (isGpgpuSim) {
247+
// Parse gpgpusim.config for available parameters
248+
unsigned n_mem = 32, n_sub_partition = 2; // defaults
249+
unsigned l2_nsets = 32, l2_linesize = 128, l2_assoc = 24; // defaults for L2 per bank
250+
std::string line;
251+
while (std::getline(configFile, line)) {
252+
std::istringstream iss(line);
253+
std::string key;
254+
if (iss >> key) {
255+
if (key == "-gpgpu_n_mem") {
256+
iss >> n_mem; // number of memory controllers
257+
} else if (key == "-gpgpu_n_sub_partition_per_mchannel") {
258+
iss >> n_sub_partition; // number of L2 banks per memory controller
259+
} else if (key == "-gpgpu_cache:dl2") {
260+
// Format: X:nsets:linesize:assoc,... where X is any letter
261+
std::string cacheConfig;
262+
iss >> cacheConfig;
263+
// Parse X:nsets:linesize:assoc using sscanf, skip first char
264+
char dummy;
265+
sscanf(cacheConfig.c_str(), "%c:%u:%u:%u", &dummy, &l2_nsets, &l2_linesize, &l2_assoc);
266+
}
267+
}
268+
}
269+
configFile.close();
270+
271+
// Use struct default values (already initialized in GpuConfig)
272+
// Override FBP_COUNT and L2_BANKS from gpgpusim.config
273+
config.FBP_COUNT = n_mem;
274+
config.L2_BANKS = n_mem * n_sub_partition;
275+
// L2_SIZE = (nsets * linesize * assoc) per bank * banks_per_controller * num_controllers
276+
size_t l2_size_per_bank = (size_t)l2_nsets * l2_linesize * l2_assoc;
277+
config.L2_SIZE = l2_size_per_bank * n_sub_partition * n_mem;
278+
} else {
279+
// Running on real hardware - query device properties
280+
cudaSetDevice(deviceID);
281+
cudaGetDeviceProperties(&deviceProp, deviceID);
282+
283+
int clockRateKHz;
284+
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
285+
286+
// core stats
287+
config.SM_NUMBER = deviceProp.multiProcessorCount;
288+
config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
289+
config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
290+
config.WARP_SIZE = deviceProp.warpSize;
291+
config.MAX_WARPS_PER_SM =
292+
deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
293+
config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;
294+
295+
// threadblock stats
296+
config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
297+
config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
298+
config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;
299+
300+
// launched thread blocks to ensure GPU is fully occupied as much as possible
301+
config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
302+
config.BLOCKS_PER_SM =
303+
deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
304+
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
305+
config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
306+
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
307+
308+
// L2 cache
309+
config.L2_SIZE = deviceProp.l2CacheSize;
310+
311+
// memory
312+
config.MEM_SIZE = deviceProp.totalGlobalMem;
313+
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
314+
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
315+
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
316+
317+
// Get FBP_COUNT and L2_BANKS from NVIDIA RM API
318+
config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
319+
config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
320+
}
280321

281322
parseGpuConfigArgs(argc, argv);
282323
printGpuConfig();

src/cuda/parboil/driver/__init__.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,13 @@
22

33
import sys
44
import os
5-
from itertools import imap
6-
7-
import globals
8-
import actions
9-
import options
10-
import parboilfile
11-
import process
12-
import benchmark
5+
6+
from . import globals
7+
from . import actions
8+
from . import options
9+
from . import parboilfile
10+
from . import process
11+
from . import benchmark
1312

1413
def run():
1514
# Print a banner message

src/cuda/parboil/driver/actions.py

Lines changed: 26 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -4,24 +4,23 @@
44
# call lower-level routines from the process or benchmark modules.
55

66
import os
7-
from itertools import imap
87

9-
import process
10-
import benchmark
11-
import globals
12-
from text import format_columns
8+
from . import process
9+
from . import benchmark
10+
from . import globals
11+
from .text import format_columns
1312

14-
from error import ErrorType
13+
from .error import ErrorType
1514

1615
def benchmark_iter():
1716
"""Iterate over the benchmarks in 'bmks'."""
1817
# bmks is a dictionary from str to Future(Benchmark)
19-
return imap(lambda x: x.get(), globals.benchmarks.itervalues())
18+
return map(lambda x: x.get(), globals.benchmarks.values())
2019

2120
def list_benchmarks():
2221
"""List all benchmarks on standard output."""
23-
print "Benchmarks:"
24-
for bmk in benchmark_iter(): print " " + bmk.name
22+
print("Benchmarks:")
23+
for bmk in benchmark_iter(): print(" " + bmk.name)
2524

2625
def describe_benchmarks():
2726
"""Print descriptions of all benchmarks to standard output."""
@@ -30,8 +29,8 @@ def describe_benchmarks():
3029
def describe_benchmark(bmk):
3130
"""Print a description of one benchmark to standard output."""
3231

33-
print " " + bmk.name
34-
print format_columns(bmk.describe(), 4)
32+
print(" " + bmk.name)
33+
print(format_columns(bmk.describe(), 4))
3534

3635
def lookup_benchmark(name):
3736
"""Find a benchmark, given its name. Returns None if no benchmark
@@ -43,11 +42,11 @@ def lookup_benchmark(name):
4342
if bmk.invalid is None:
4443
return bmk
4544
else:
46-
print "Error in benchmark:"
47-
print str(bmk.invalid)
45+
print("Error in benchmark:")
46+
print(str(bmk.invalid))
4847
return None
4948
else:
50-
print "Cannot find benchmark"
49+
print("Cannot find benchmark")
5150
return None
5251

5352
def with_benchmark_named(name, action):
@@ -62,9 +61,9 @@ def compile_benchmark(bmk, version_name, platform=None):
6261
"""Compile the benchmark 'bmk'."""
6362
try: impl = bmk.impls[version_name]
6463
except KeyError:
65-
print "Cannot find benchmark version"
64+
print("Cannot find benchmark version")
6665
return
67-
66+
6867
return impl.build(bmk, platform)
6968

7069
def clean_benchmark(bmk, version_name=None, platform=None):
@@ -74,56 +73,56 @@ def clean_benchmark(bmk, version_name=None, platform=None):
7473
if version_name:
7574
try: impl = bmk.impls[version_name]
7675
except KeyError:
77-
print "Cannot find benchmark version"
76+
print("Cannot find benchmark version")
7877
return
7978

8079
impl.clean(bmk, platform)
8180
else:
8281
# Clean all versions
83-
for impl in bmk.impls.itervalues():
82+
for impl in bmk.impls.values():
8483
impl.clean(bmk, platform)
8584

8685
def run_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
8786
"""Run the benchmark 'bmk'."""
8887
try: impl = bmk.impls[version_name]
8988
except KeyError:
90-
print "Cannot find benchmark version"
89+
print("Cannot find benchmark version")
9190
return ErrorType.CannotFindVersion
92-
91+
9392
try: data = bmk.datas[input_name]
9493
except KeyError:
95-
print "Cannot find data set"
94+
print("Cannot find data set")
9695
return ErrorType.CannotFindDataSet
9796

9897
# Run the benchmark
9998
ret = impl.run(bmk, data, check, extra_opts=extra_opts, platform=platform)
10099

101100
if ret is not ErrorType.Success:
102-
print "Run failed!"
101+
print("Run failed!")
103102
return ret
104103

105104
# Verify the output
106105
if check:
107106
success = impl.check(bmk, data)
108107

109108
if not success:
110-
print "Output checking tool detected a mismatch"
109+
print("Output checking tool detected a mismatch")
111110
return ErrorType.OutputMismatch
112111
else:
113-
print "Output was not checked for correctness"
112+
print("Output was not checked for correctness")
114113

115114
return ErrorType.Success
116115

117116
def debug_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
118117
"""Debug the benchmark."""
119118
try: impl = bmk.impls[version_name]
120119
except KeyError:
121-
print "Cannot find benchmark version"
120+
print("Cannot find benchmark version")
122121
return ErrorType.CannotFindVersion
123-
122+
124123
try: data = bmk.datas[input_name]
125124
except KeyError:
126-
print "Cannot find data set"
125+
print("Cannot find data set")
127126
return ErrorType.CannotFindDataSet
128127

129128
# Run the benchmark

0 commit comments

Comments
 (0)