From 52ae9dc7dfb32e3e383fef40a850a53ff49a24b5 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 23 Jul 2025 11:57:56 +0200 Subject: [PATCH 1/4] Repro of issue #1169 --- .github/workflows/pr_push.yml | 72 ------------------------------ .github/workflows/reusable_gpu.yml | 8 ++-- 2 files changed, 5 insertions(+), 75 deletions(-) diff --git a/.github/workflows/pr_push.yml b/.github/workflows/pr_push.yml index 52bd73756..08b957952 100644 --- a/.github/workflows/pr_push.yml +++ b/.github/workflows/pr_push.yml @@ -17,88 +17,16 @@ permissions: packages: read jobs: - CodeChecks: - uses: ./.github/workflows/reusable_checks.yml - FastBuild: - name: Fast builds - needs: [CodeChecks] - uses: ./.github/workflows/reusable_fast.yml - Build: - name: Basic builds - needs: [FastBuild] - uses: ./.github/workflows/reusable_basic.yml - DevDax: - needs: [FastBuild] - uses: ./.github/workflows/reusable_dax.yml - MultiNuma: - needs: [FastBuild] - uses: ./.github/workflows/reusable_multi_numa.yml L0: - needs: [Build] uses: ./.github/workflows/reusable_gpu.yml with: provider: "LEVEL_ZERO" runner: "L0" shared_lib: "['ON']" L0-BMG: - needs: [Build] uses: ./.github/workflows/reusable_gpu.yml with: provider: "LEVEL_ZERO" runner: "L0-BMG" shared_lib: "['ON']" os: "['Ubuntu']" - CUDA: - needs: [Build] - uses: ./.github/workflows/reusable_gpu.yml - with: - provider: "CUDA" - runner: "CUDA" - shared_lib: "['ON']" - Sanitizers: - needs: [FastBuild] - uses: ./.github/workflows/reusable_sanitizers.yml - QEMU: - needs: [FastBuild] - uses: ./.github/workflows/reusable_qemu.yml - with: - short_run: true - ProxyLib: - needs: [Build] - uses: ./.github/workflows/reusable_proxy_lib.yml - Valgrind: - needs: [Build] - uses: ./.github/workflows/reusable_valgrind.yml - Coverage: - # total coverage (on upstream only) - if: github.repository == 'oneapi-src/unified-memory-framework' - needs: [Build, DevDax, L0, CUDA, MultiNuma, QEMU, ProxyLib] - uses: ./.github/workflows/reusable_coverage.yml - secrets: inherit - with: - trigger: "${{github.event_name}}" - Coverage_partial: - # partial coverage (on forks) - if: github.repository != 'oneapi-src/unified-memory-framework' - needs: [Build, QEMU, ProxyLib] - uses: ./.github/workflows/reusable_coverage.yml - CodeQL: - needs: [Build] - permissions: - contents: read - security-events: write - uses: ./.github/workflows/reusable_codeql.yml - Trivy: - needs: [Build] - permissions: - contents: read - security-events: write - uses: ./.github/workflows/reusable_trivy.yml - Compatibility: - needs: [Build] - uses: ./.github/workflows/reusable_compatibility.yml - strategy: - matrix: - tag: ["v1.0.0"] - with: - tag: ${{matrix.tag}} diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml index f0d1bcda8..78cfeaeab 100644 --- a/.github/workflows/reusable_gpu.yml +++ b/.github/workflows/reusable_gpu.yml @@ -16,7 +16,7 @@ on: os: description: A list of OSes type: string - default: "['Ubuntu', 'Windows']" + default: "['Ubuntu']" shared_lib: description: A list of options for building shared library type: string @@ -129,7 +129,8 @@ jobs: - name: Run tests (Debug) working-directory: ${{env.BUILD_DEBUG_DIR}} - run: ctest -C Debug --output-on-failure --test-dir test + # run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ./test/test_provider_level_zero_dlopen_global || exit 1; date; done + run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ctest -V -R provider_level_zero || exit 1; date; done - name: Run examples (Debug) working-directory: ${{env.BUILD_DEBUG_DIR}} @@ -163,7 +164,8 @@ jobs: - name: Run tests (Release) working-directory: ${{env.BUILD_RELEASE_DIR}} - run: ctest -C Release --output-on-failure --test-dir test + # run: for i in {1..1000}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ./test/test_provider_level_zero_dlopen_global || exit 1; date; done + run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ctest -V -R provider_level_zero || exit 1; date; done - name: Run examples (Release) working-directory: ${{env.BUILD_RELEASE_DIR}} From 6521d9fe8b8cfc5e954ef31aeb5b11760c4ca9e3 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 25 Jul 2025 13:43:12 +0200 Subject: [PATCH 2/4] Replace Release with RelWithDebInfo --- .github/workflows/reusable_gpu.yml | 53 ++++-------------------------- 1 file changed, 6 insertions(+), 47 deletions(-) diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml index 78cfeaeab..786f057a9 100644 --- a/.github/workflows/reusable_gpu.yml +++ b/.github/workflows/reusable_gpu.yml @@ -132,17 +132,13 @@ jobs: # run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ./test/test_provider_level_zero_dlopen_global || exit 1; date; done run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ctest -V -R provider_level_zero || exit 1; date; done - - name: Run examples (Debug) - working-directory: ${{env.BUILD_DEBUG_DIR}} - run: ctest --output-on-failure --test-dir examples -C Debug - - - name: Configure build (Release) + - name: Configure build (RelWithDebInfo) run: > cmake -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}};${{env.EXTRA_CMAKE_PATH}}" -B ${{env.BUILD_RELEASE_DIR}} -DCMAKE_INSTALL_PREFIX="${{env.INSTL_RELEASE_DIR}}" - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=${{env.C_COMPILER}} -DCMAKE_CXX_COMPILER=${{env.CXX_COMPILER}} -DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}} @@ -159,47 +155,10 @@ jobs: -DUMF_TESTS_FAIL_ON_SKIP=ON ${{ matrix.os == 'Windows' && '-DCMAKE_SUPPRESS_REGENERATION=ON' || '' }} - - name: Build UMF (Release) - run: cmake --build ${{env.BUILD_RELEASE_DIR}} --config Release -j ${{env.PROCS}} + - name: Build UMF (RelWithDebInfo) + run: cmake --build ${{env.BUILD_RELEASE_DIR}} --config RelWithDebInfo -j ${{env.PROCS}} - - name: Run tests (Release) + - name: Run tests (RelWithDebInfo) working-directory: ${{env.BUILD_RELEASE_DIR}} # run: for i in {1..1000}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ./test/test_provider_level_zero_dlopen_global || exit 1; date; done - run: for i in {1..100}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ctest -V -R provider_level_zero || exit 1; date; done - - - name: Run examples (Release) - working-directory: ${{env.BUILD_RELEASE_DIR}} - run: ctest --output-on-failure --test-dir examples -C Release - - - name: Run benchmarks (Release) - working-directory: ${{env.BUILD_RELEASE_DIR}} - run: ctest --output-on-failure --test-dir benchmark -C Release --exclude-regex umf-multithreaded - - - name: "[Lin] Check coverage (Debug)" - if: ${{ matrix.os == 'Ubuntu' }} - working-directory: ${{env.BUILD_DEBUG_DIR}} - run: | - export COVERAGE_FILE_NAME=${{env.COVERAGE_NAME}}-shared-${{matrix.shared_library}} - echo "COVERAGE_FILE_NAME: $COVERAGE_FILE_NAME" - ${{github.workspace}}/scripts/coverage/coverage_capture.sh $COVERAGE_FILE_NAME - mkdir -p ${{env.COVERAGE_DIR}} - mv ./$COVERAGE_FILE_NAME ${{env.COVERAGE_DIR}} - - - name: "[Lin] Upload coverage" - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - if: ${{ matrix.os == 'Ubuntu' }} - with: - name: ${{env.COVERAGE_NAME}}-shared-${{matrix.shared_library}} - path: ${{env.COVERAGE_DIR}} - - - name: "[Win] Prepare vcpkg cache" - if: matrix.os == 'Windows' && steps.cache.outputs.cache-hit != 'true' - run: | - Compress-Archive -Path ${{github.workspace}}/vcpkg/packages -DestinationPath ${{github.workspace}}/vcpkg_pkgs_cache.zip -Force -CompressionLevel Fastest - - - name: "[Win] Save vcpkg cache" - if: matrix.os == 'Windows' && steps.cache.outputs.cache-hit != 'true' - uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: ${{github.workspace}}/vcpkg_pkgs_cache.zip - key: ${{ steps.cache.outputs.cache-primary-key }} + run: for i in {1..1000}; do echo ">>> ITERATION no. ${i}" ; UMF_LOG="level:debug;flush:debug;output:stderr;pid:yes" ctest -V -R provider_level_zero || exit 1; date; done From a6f1d0b3d9b07719f13d7acdef5e4a998d23e853 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 31 Jul 2025 10:38:15 +0200 Subject: [PATCH 3/4] Add checksum --- src/provider/provider_level_zero.c | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/provider/provider_level_zero.c b/src/provider/provider_level_zero.c index 8703e8fdc..fc81ce3b4 100644 --- a/src/provider/provider_level_zero.c +++ b/src/provider/provider_level_zero.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -753,9 +754,22 @@ static umf_result_t ze_memory_provider_allocation_split(void *provider, typedef struct ze_ipc_data_t { int pid; + uint64_t id; // Unique identifier for the IPC handle + uint64_t checksum; ze_ipc_mem_handle_t ze_handle; } ze_ipc_data_t; +// Compute a simple checksum of the ze_handle field in ze_ipc_data_t +static uint64_t ze_ipc_handle_checksum(const ze_ipc_data_t *ipc_data) { + // Interpret the ze_handle as a byte array + const uint8_t *bytes = (const uint8_t *)&ipc_data->ze_handle; + uint64_t checksum = 0; + for (size_t i = 0; i < sizeof(ipc_data->ze_handle); ++i) { + checksum += bytes[i]; + } + return checksum; +} + static umf_result_t ze_memory_provider_get_ipc_handle_size(void *provider, size_t *size) { (void)provider; @@ -770,6 +784,8 @@ static umf_result_t ze_memory_provider_get_ipc_handle(void *provider, void *providerIpcData) { (void)size; + static uint64_t id = 0; + ze_result_t ze_result; ze_ipc_data_t *ze_ipc_data = (ze_ipc_data_t *)providerIpcData; struct ze_memory_provider_t *ze_provider = @@ -783,6 +799,11 @@ static umf_result_t ze_memory_provider_get_ipc_handle(void *provider, } ze_ipc_data->pid = utils_getpid(); + ze_ipc_data->checksum = ze_ipc_handle_checksum(ze_ipc_data); + ze_ipc_data->id = id++; + + LOG_DEBUG("GET handle(): pid = %d, id = %lu, checksum = %lu", ze_ipc_data->pid, + ze_ipc_data->id, ze_ipc_data->checksum); return UMF_RESULT_SUCCESS; } @@ -801,6 +822,16 @@ static umf_result_t ze_memory_provider_put_ipc_handle(void *provider, return UMF_RESULT_SUCCESS; } + if (ze_ipc_data->checksum != ze_ipc_handle_checksum(ze_ipc_data)) { + LOG_FATAL( + "Checksum mismatch for IPC handle data: pid = %d, checksum = %lu", + ze_ipc_data->pid, ze_ipc_data->checksum); + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + LOG_DEBUG("PUT handle(): pid = %d, id = %lu, checksum = %lu", ze_ipc_data->pid, + ze_ipc_data->id, ze_ipc_data->checksum); + ze_result = g_ze_ops.zeMemPutIpcHandle(ze_provider->context, ze_ipc_data->ze_handle); if (ze_result != ZE_RESULT_SUCCESS) { From 65c9cfcd7abb9983e2bc88c581d2c3031809fedc Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 1 Aug 2025 11:19:54 +0200 Subject: [PATCH 4/4] Remove checksum --- src/provider/provider_level_zero.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/src/provider/provider_level_zero.c b/src/provider/provider_level_zero.c index fc81ce3b4..37f8b6b17 100644 --- a/src/provider/provider_level_zero.c +++ b/src/provider/provider_level_zero.c @@ -755,21 +755,9 @@ static umf_result_t ze_memory_provider_allocation_split(void *provider, typedef struct ze_ipc_data_t { int pid; uint64_t id; // Unique identifier for the IPC handle - uint64_t checksum; ze_ipc_mem_handle_t ze_handle; } ze_ipc_data_t; -// Compute a simple checksum of the ze_handle field in ze_ipc_data_t -static uint64_t ze_ipc_handle_checksum(const ze_ipc_data_t *ipc_data) { - // Interpret the ze_handle as a byte array - const uint8_t *bytes = (const uint8_t *)&ipc_data->ze_handle; - uint64_t checksum = 0; - for (size_t i = 0; i < sizeof(ipc_data->ze_handle); ++i) { - checksum += bytes[i]; - } - return checksum; -} - static umf_result_t ze_memory_provider_get_ipc_handle_size(void *provider, size_t *size) { (void)provider; @@ -799,11 +787,10 @@ static umf_result_t ze_memory_provider_get_ipc_handle(void *provider, } ze_ipc_data->pid = utils_getpid(); - ze_ipc_data->checksum = ze_ipc_handle_checksum(ze_ipc_data); ze_ipc_data->id = id++; - LOG_DEBUG("GET handle(): pid = %d, id = %lu, checksum = %lu", ze_ipc_data->pid, - ze_ipc_data->id, ze_ipc_data->checksum); + LOG_DEBUG("GET handle(): pid = %d, id = %lu", ze_ipc_data->pid, + ze_ipc_data->id); return UMF_RESULT_SUCCESS; } @@ -822,15 +809,8 @@ static umf_result_t ze_memory_provider_put_ipc_handle(void *provider, return UMF_RESULT_SUCCESS; } - if (ze_ipc_data->checksum != ze_ipc_handle_checksum(ze_ipc_data)) { - LOG_FATAL( - "Checksum mismatch for IPC handle data: pid = %d, checksum = %lu", - ze_ipc_data->pid, ze_ipc_data->checksum); - return UMF_RESULT_ERROR_INVALID_ARGUMENT; - } - - LOG_DEBUG("PUT handle(): pid = %d, id = %lu, checksum = %lu", ze_ipc_data->pid, - ze_ipc_data->id, ze_ipc_data->checksum); + LOG_DEBUG("PUT handle(): pid = %d, id = %lu", ze_ipc_data->pid, + ze_ipc_data->id); ze_result = g_ze_ops.zeMemPutIpcHandle(ze_provider->context, ze_ipc_data->ze_handle);