Skip to content

Commit e214245

Browse files
committed
Raise MSVC warning level from /W3 to /W4
This reinstates the changes from oneapi-src#2100, which were reverted in oneapi-src#2302 due to regressions in intel/llvm testing.
1 parent cfe5912 commit e214245

File tree

71 files changed

+292
-237
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

71 files changed

+292
-237
lines changed

cmake/helpers.cmake

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,18 +108,25 @@ function(add_ur_target_compile_options name)
108108
elseif(MSVC)
109109
target_compile_options(${name} PRIVATE
110110
$<$<CXX_COMPILER_ID:MSVC>:/MP> # clang-cl.exe does not support /MP
111-
/W3
111+
/W4
112+
/wd4456 # Disable: declaration of 'identifier' hides previous local declaration
113+
/wd4457 # Disable: declaration of 'identifier' hides function parameter
114+
/wd4458 # Disable: declaration of 'identifier' hides class member
115+
/wd4459 # Disable: declaration of 'identifier' hides global declaration
112116
/MD$<$<CONFIG:Debug>:d>
113117
/GS
114118
/DWIN32_LEAN_AND_MEAN
115119
/DNOMINMAX
116120
)
117121

118-
if(UR_DEVELOPER_MODE)
122+
target_compile_definitions(${name} PRIVATE
119123
# _CRT_SECURE_NO_WARNINGS used mainly because of getenv
120-
# C4267: The compiler detected a conversion from size_t to a smaller type.
124+
_CRT_SECURE_NO_WARNINGS
125+
)
126+
127+
if(UR_DEVELOPER_MODE)
121128
target_compile_options(${name} PRIVATE
122-
/WX /GS /D_CRT_SECURE_NO_WARNINGS /wd4267
129+
/WX /GS
123130
)
124131
endif()
125132
endif()

examples/collector/collector.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,14 @@
2525
#include <string_view>
2626

2727
#include "ur_api.h"
28+
29+
#ifdef _MSC_VER
30+
#pragma warning(disable : 4245)
31+
#endif
2832
#include "xpti/xpti_trace_framework.h"
33+
#ifdef _MSC_VER
34+
#pragma warning(default : 4245)
35+
#endif
2936

3037
constexpr uint16_t TRACE_FN_BEGIN =
3138
static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin);

include/ur_api.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ typedef struct ur_physical_mem_handle_t_ *ur_physical_mem_handle_t;
426426
///////////////////////////////////////////////////////////////////////////////
427427
#ifndef UR_BIT
428428
/// @brief Generic macro for enumerator bit masks
429-
#define UR_BIT(_i) (1 << _i)
429+
#define UR_BIT(_i) (1U << _i)
430430
#endif // UR_BIT
431431

432432
///////////////////////////////////////////////////////////////////////////////

scripts/core/common.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ name: "$x_physical_mem_handle_t"
134134
type: macro
135135
desc: "Generic macro for enumerator bit masks"
136136
name: "$X_BIT( _i )"
137-
value: "( 1 << _i )"
137+
value: "( 1U << _i )"
138138
--- #--------------------------------------------------------------------------
139139
type: enum
140140
desc: "Defines Return/Error codes"

source/adapters/cuda/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,16 @@ if (UR_ENABLE_TRACING)
9797
get_target_property(XPTI_SRC_DIR xpti SOURCE_DIR)
9898
set(XPTI_PROXY_SRC "${XPTI_SRC_DIR}/xpti_proxy.cpp")
9999
endif()
100-
target_compile_definitions(${TARGET_NAME} PRIVATE
100+
add_library(cuda-xpti-proxy STATIC ${XPTI_PROXY_SRC})
101+
target_compile_definitions(cuda-xpti-proxy PRIVATE
101102
XPTI_ENABLE_INSTRUMENTATION
102103
XPTI_STATIC_LIBRARY
103104
)
104-
target_include_directories(${TARGET_NAME} PRIVATE
105+
target_include_directories(cuda-xpti-proxy PRIVATE
105106
${XPTI_INCLUDES}
106107
${CUDA_CUPTI_INCLUDE_DIR}
107108
)
108-
target_sources(${TARGET_NAME} PRIVATE ${XPTI_PROXY_SRC})
109+
target_link_libraries(${TARGET_NAME} PRIVATE cuda-xpti-proxy)
109110
endif()
110111

111112
if (CUDA_cupti_LIBRARY)

source/adapters/cuda/command_buffer.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
242242
if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
243243
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
244244
NodeParams.dst = DstPtr;
245-
NodeParams.elementSize = PatternSize;
245+
NodeParams.elementSize = static_cast<unsigned int>(PatternSize);
246246
NodeParams.height = N;
247247
NodeParams.pitch = PatternSize;
248248
NodeParams.width = 1;
@@ -508,12 +508,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
508508
auto &ArgIndices = hKernel->getArgIndices();
509509
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
510510
NodeParams.func = CuFunc;
511-
NodeParams.gridDimX = BlocksPerGrid[0];
512-
NodeParams.gridDimY = BlocksPerGrid[1];
513-
NodeParams.gridDimZ = BlocksPerGrid[2];
514-
NodeParams.blockDimX = ThreadsPerBlock[0];
515-
NodeParams.blockDimY = ThreadsPerBlock[1];
516-
NodeParams.blockDimZ = ThreadsPerBlock[2];
511+
NodeParams.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
512+
NodeParams.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
513+
NodeParams.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
514+
NodeParams.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
515+
NodeParams.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
516+
NodeParams.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
517517
NodeParams.sharedMemBytes = LocalSize;
518518
NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
519519

@@ -1397,12 +1397,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
13971397
CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;
13981398

13991399
Params.func = CuFunc;
1400-
Params.gridDimX = BlocksPerGrid[0];
1401-
Params.gridDimY = BlocksPerGrid[1];
1402-
Params.gridDimZ = BlocksPerGrid[2];
1403-
Params.blockDimX = ThreadsPerBlock[0];
1404-
Params.blockDimY = ThreadsPerBlock[1];
1405-
Params.blockDimZ = ThreadsPerBlock[2];
1400+
Params.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
1401+
Params.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
1402+
Params.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
1403+
Params.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
1404+
Params.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
1405+
Params.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
14061406
Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
14071407
Params.kernelParams =
14081408
const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());

source/adapters/cuda/device.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
11531153

11541154
try {
11551155
if (pNumDevices) {
1156-
*pNumDevices = NumDevices;
1156+
*pNumDevices = static_cast<uint32_t>(NumDevices);
11571157
}
11581158

11591159
if (ReturnDevices && phDevices) {
@@ -1236,7 +1236,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
12361236
ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
12371237
uint64_t *pDeviceTimestamp,
12381238
uint64_t *pHostTimestamp) {
1239-
CUevent Event;
1239+
CUevent Event{};
12401240
ScopedContext Active(hDevice);
12411241

12421242
if (pDeviceTimestamp) {

source/adapters/cuda/enqueue.cpp

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
160160
int MinGrid, MaxBlockSize;
161161
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
162162
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
163-
MaxBlockDim[0]));
163+
static_cast<int>(MaxBlockDim[0])));
164164

165165
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
166166
MaxBlockDim, MaxBlockSize);
@@ -208,7 +208,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
208208
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
209209

210210
if (ProvidedLocalWorkGroupSize) {
211-
auto IsValid = [&](int Dim) {
211+
auto IsValid = [&](size_t Dim) {
212212
if (ReqdThreadsPerBlock[Dim] != 0 &&
213213
LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
214214
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
@@ -217,7 +217,8 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
217217
LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
218218
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
219219

220-
if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
220+
if (LocalWorkSize[Dim] >
221+
Device->getMaxWorkItemSizes(static_cast<int>(Dim)))
221222
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
222223
// Checks that local work sizes are a divisor of the global work sizes
223224
// which includes that the local work sizes are neither larger than
@@ -489,9 +490,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
489490

490491
auto &ArgIndices = hKernel->getArgIndices();
491492
UR_CHECK_ERROR(cuLaunchKernel(
492-
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
493-
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
494-
CuStream, const_cast<void **>(ArgIndices.data()), nullptr));
493+
CuFunc, static_cast<unsigned int>(BlocksPerGrid[0]),
494+
static_cast<unsigned int>(BlocksPerGrid[1]),
495+
static_cast<unsigned int>(BlocksPerGrid[2]),
496+
static_cast<unsigned int>(ThreadsPerBlock[0]),
497+
static_cast<unsigned int>(ThreadsPerBlock[1]),
498+
static_cast<unsigned int>(ThreadsPerBlock[2]), LocalSize, CuStream,
499+
const_cast<void **>(ArgIndices.data()), nullptr));
495500

496501
if (LocalSize != 0)
497502
hKernel->clearLocalSize();
@@ -657,12 +662,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
657662
auto &ArgIndices = hKernel->getArgIndices();
658663

659664
CUlaunchConfig launch_config;
660-
launch_config.gridDimX = BlocksPerGrid[0];
661-
launch_config.gridDimY = BlocksPerGrid[1];
662-
launch_config.gridDimZ = BlocksPerGrid[2];
663-
launch_config.blockDimX = ThreadsPerBlock[0];
664-
launch_config.blockDimY = ThreadsPerBlock[1];
665-
launch_config.blockDimZ = ThreadsPerBlock[2];
665+
launch_config.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
666+
launch_config.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
667+
launch_config.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
668+
launch_config.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
669+
launch_config.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
670+
launch_config.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
666671

667672
launch_config.sharedMemBytes = LocalSize;
668673
launch_config.hStream = CuStream;
@@ -1075,8 +1080,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
10751080
break;
10761081
}
10771082
default: {
1078-
UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size,
1079-
pPattern, DstDevice));
1083+
UR_CHECK_ERROR(
1084+
commonMemSetLargePattern(Stream, static_cast<uint32_t>(patternSize),
1085+
size, pPattern, DstDevice));
10801086
break;
10811087
}
10821088
}
@@ -1108,7 +1114,6 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
11081114
return 4;
11091115
default:
11101116
detail::ur::die("Invalid image format.");
1111-
return 0;
11121117
}
11131118
}
11141119

@@ -1212,7 +1217,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
12121217
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
12131218
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
12141219

1215-
int ElementByteSize = imageElementByteSize(ArrayDesc);
1220+
int ElementByteSize = static_cast<int>(imageElementByteSize(ArrayDesc));
12161221

12171222
size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
12181223
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
@@ -1285,7 +1290,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
12851290
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
12861291
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
12871292

1288-
int ElementByteSize = imageElementByteSize(ArrayDesc);
1293+
int ElementByteSize = static_cast<int>(imageElementByteSize(ArrayDesc));
12891294

12901295
size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
12911296
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
@@ -1364,7 +1369,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
13641369
UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels,
13651370
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
13661371

1367-
int ElementByteSize = imageElementByteSize(SrcArrayDesc);
1372+
int ElementByteSize = static_cast<int>(imageElementByteSize(SrcArrayDesc));
13681373

13691374
size_t DstByteOffsetX =
13701375
dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels;
@@ -1549,8 +1554,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
15491554
CuStream));
15501555
break;
15511556
default:
1552-
commonMemSetLargePattern(CuStream, patternSize, size, pPattern,
1553-
(CUdeviceptr)ptr);
1557+
commonMemSetLargePattern(CuStream, static_cast<uint32_t>(patternSize),
1558+
size, pPattern, (CUdeviceptr)ptr);
15541559
break;
15551560
}
15561561
if (phEvent) {

source/adapters/cuda/image.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,8 +284,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
284284
ur_result_t Result = UR_RESULT_SUCCESS;
285285
try {
286286
ScopedContext Active(hDevice);
287-
UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch,
288-
widthInBytes, height, elementSizeBytes));
287+
UR_CHECK_ERROR(
288+
cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, widthInBytes,
289+
height, static_cast<unsigned int>(elementSizeBytes)));
289290
} catch (ur_result_t error) {
290291
Result = error;
291292
} catch (...) {

source/adapters/cuda/kernel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
203203

204204
int MaxNumActiveGroupsPerCU{0};
205205
UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
206-
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
207-
dynamicSharedMemorySize));
206+
&MaxNumActiveGroupsPerCU, hKernel->get(),
207+
static_cast<int>(localWorkSize), dynamicSharedMemorySize));
208208
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
209209
// Handle the case where we can't have all SMs active with at least 1 group
210210
// per SM. In that case, the device is still able to run 1 work-group, hence

0 commit comments

Comments
 (0)