Skip to content

Commit 6840353

Browse files
authored
Add cache flushing capability to imex profiling. (#694)
1 parent c858863 commit 6840353

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

lib/ExecutionEngine/LEVELZERORUNTIME/LevelZeroRuntimeWrappers.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <cstdint>
2121
#include <cstdio>
2222
#include <cstdlib>
23+
#include <cstring>
2324
#include <map>
2425
#include <memory>
2526
#include <mutex>
@@ -465,6 +466,12 @@ static void launchKernel(GPUL0QUEUE *queue, ze_kernel_handle_t kernel,
465466
auto rounds = 1000;
466467
auto warmups = 3;
467468

469+
// Before each run we need to flush the L2 cache to make sure each profiling
470+
// run has the same cache state. This is done by writing zeros to a buffer
471+
// larger than the L2 cache size.
472+
size_t cacheSize = 256000000;
473+
auto *cache = allocDeviceMemory(queue, cacheSize, 64, true);
474+
468475
if (getenv("IMEX_PROFILING_RUNS")) {
469476
auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
470477
if (runs)
@@ -497,7 +504,10 @@ static void launchKernel(GPUL0QUEUE *queue, ze_kernel_handle_t kernel,
497504
maxTime = duration;
498505
if (duration < minTime)
499506
minTime = duration;
507+
// flush the cache.
508+
memset(cache, 0, cacheSize);
500509
}
510+
deallocDeviceMemory(queue, cache);
501511
fprintf(stdout,
502512
"the kernel execution time is (ms, on L0 runtime):"
503513
"avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",

lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
#include "llvm/Support/raw_ostream.h"
1516
#include <algorithm>
1617
#include <array>
1718
#include <atomic>
@@ -303,6 +304,12 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
303304
auto rounds = 100;
304305
auto warmups = 3;
305306

307+
// Before each run we need to flush the L2 cache to make sure each profiling
308+
// run has the same cache state. This is done by writing zeros to a buffer
309+
// larger than the L2 cache size.
310+
size_t cacheSize = 256000000;
311+
auto *cache = allocDeviceMemory(queue, cacheSize, 64, true);
312+
306313
if (getenv("IMEX_PROFILING_RUNS")) {
307314
auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
308315
if (runs)
@@ -337,8 +344,11 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
337344
maxTime = gap;
338345
if (gap < minTime)
339346
minTime = gap;
347+
// flush the cache.
348+
memset(cache, 0, cacheSize);
340349
}
341350

351+
deallocDeviceMemory(queue, cache);
342352
fprintf(stdout,
343353
"the kernel execution time is (ms):"
344354
"avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",

0 commit comments

Comments
 (0)