Skip to content

Commit 46dac16

Browse files
committed
Fixes #769: Applying a bunch of code style, clarity, disambiguation and similar suggestions from a CLion IDE code inspection:
* Using rethrow expressions (`throw;`) instead of throwing the caught exception object (`throw ex;`). * Made more constant variables `constexpr` * Replace post-increment with pre-increment * Removed a redundant `typename` * Removed some redundant casts * Removed some redundant parentheses * Spacing tweaks * Made some memory allocation functions and a `detail_::get_context_for()` function take more `const` parameters * Removed unnecessary inclusions (and replaced them with forward-declarations or more limited inclusions) * Removed redundant `inline` specifications (for function templates) * Removed redundant `const` specifier for parameters passed by value * Example programs code: * Common examples utility: * Added explanation of possible demangling failure status * Made the demangling function a little more self-documenting (saving a couple of comments), and avoided using 0 for `nullptr` while at it * Avoiding C-style cast or functional-style enum construction to better express intent (and not confuse static analyzers) * Made some more function parameters `const` * Avoiding warnings about unused variables (+ spacing tweaks) * Made some more function parameters `const` (which NVIDIA should have, actually) * `SimpleCudaGraphs` example program: Switching a `reinterpret_cast` to a more appropriate `static_cast` * Module management example program: Avoiding variable shadowing (although it's not harmful)
1 parent c0aee06 commit 46dac16

27 files changed

+113
-97
lines changed

examples/by_api_module/context_management.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ void test_context(
7272
auto printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
7373
std::cout << "The printf FIFO size for context " << context << " is " << printf_fifo_size << ".\n";
7474
decltype(printf_fifo_size) new_printf_fifo_size =
75-
(printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512;
75+
(printf_fifo_size <= 1024) ? 2 * printf_fifo_size : printf_fifo_size - 512;
7676
context.set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_printf_fifo_size);
7777
printf_fifo_size = context.get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
7878
assert_(printf_fifo_size == new_printf_fifo_size);
@@ -91,9 +91,9 @@ void test_context(
9191
}
9292

9393
void current_context_manipulation(
94-
cuda::device_t &device,
95-
cuda::device::primary_context_t &pc,
96-
cuda::context_t &created_context)
94+
const cuda::device_t &device,
95+
const cuda::device::primary_context_t &pc,
96+
const cuda::context_t &created_context)
9797
{
9898
cuda::context_t context_0 = pc;
9999
cuda::context_t context_1 = created_context;
@@ -104,13 +104,13 @@ void current_context_manipulation(
104104
assert_(cuda::context::current::get() == context_1);
105105
assert_(cuda::context::current::detail_::get_handle() == context_1.handle());
106106

107-
108107
auto context_2 = cuda::context::create(device);
109108
{
110109
cuda::context::current::scoped_override_t context_for_this_block { context_2 };
111110
assert_(context_2.handle() == cuda::context::current::get().handle());
112111
assert_(context_2 == cuda::context::current::get());
113112
}
113+
(void) context_2; // We want it in existence outside the inner scope
114114
auto gotten = cuda::context::current::get();
115115
assert_(gotten == context_1);
116116

examples/by_api_module/execution_control.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ int main(int argc, char **argv)
115115
launch_config_4 = std::move(launch_config_3);
116116
cuda::launch_configuration_t launch_config_5{std::move(launch_config_2)};
117117
(void) launch_config_4;
118+
(void) launch_config_5;
118119
// In case the `[[maybe_unused]]` attribute and the void-casting is ignored,
119120
// let's try to trick the compiler
120121
// into thinking we're actually using launch_config_4.

examples/by_api_module/module_management.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ std::string make_instantiation_name(string_view base_name, Ts&&... args)
8484
}
8585

8686
void handle_compilation_failure(
87-
const cuda::rtc::compilation_output_t<cuda::cuda_cpp>& compilation_output,
88-
cuda::rtc::compilation_options_t<cuda::cuda_cpp> compilation_options = {})
87+
const cuda::rtc::compilation_output_t<cuda::cuda_cpp> & compilation_output,
88+
const cuda::rtc::compilation_options_t<cuda::cuda_cpp> & compilation_options = {})
8989
{
9090
std::cerr << "Program compilation failed:\n";
9191
auto compilation_log = compilation_output.log();
@@ -108,16 +108,16 @@ get_compiled_program(const cuda::device_t &device)
108108
__constant__ int a;
109109
110110
__global__
111-
void my_kernel1(float const* indata, float* outdata) {
112-
outdata[0] = indata[0] + 1;
113-
outdata[0] -= 1;
111+
void my_kernel1(float const* in_data, float* out_data) {
112+
out_data[0] = in_data[0] + 1;
113+
out_data[0] -= 1;
114114
}
115115
116116
template<int C, typename T>
117117
__global__
118-
void my_kernel2(float const* indata, float* outdata) {
118+
void my_kernel2(float const* in_data, float* out_data) {
119119
for( int i=0; i<C; ++i ) {
120-
outdata[0] =-indata[0];
120+
out_data[0] =-in_data[0];
121121
}
122122
};
123123
@@ -165,10 +165,10 @@ bool basic_module_tests(
165165
module_kernels = std::move(module_kernels_);
166166
#endif
167167

168-
test_result = test_result and (module.device_id() == device.id());
169-
test_result = test_result and (module.device() == device);
170-
test_result = test_result and (module.context() == device.primary_context(cuda::do_not_hold_primary_context_refcount_unit));
171-
test_result = test_result and (module.context_handle() == cuda::device::primary_context::detail_::get_handle(device.id()));
168+
test_result = test_result and module.device_id() == device.id();
169+
test_result = test_result and module.device() == device;
170+
test_result = test_result and module.context() == device.primary_context(cuda::do_not_hold_primary_context_refcount_unit);
171+
test_result = test_result and module.context_handle() == cuda::device::primary_context::detail_::get_handle(device.id());
172172

173173
{
174174
auto a = module.get_global_region(compilation_result.get_mangling_of(constant_name));
@@ -181,9 +181,9 @@ bool basic_module_tests(
181181
auto my_kernel2 = module.get_kernel(mangled_kernel_names[1]);
182182

183183
auto list_kernel =
184-
[](const char * title, const char * mangled_name, cuda::optional<const char*> unmangled) {
184+
[](const char * title_, const char * mangled_name, cuda::optional<const char*> unmangled) {
185185
std::cout
186-
<< title << ":\n"
186+
<< title_ << ":\n"
187187
<< " unmangled: " << unmangled.value_or("N/A") << '\n'
188188
<< " mangled: " << mangled_name << "\n"
189189
#if __GNUC__

examples/by_api_module/unified_addressing.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,7 @@ void pointer_properties(const cuda::device_t& device)
5858
(void) host_ptr; // Some compilers don't respect [[maybe_unused]] :-(
5959
die_("Was expecting the host_ptr() method to fail for a device-side pointer");
6060
} catch(cuda::runtime_error& e) {
61-
if (e.code() != cuda::status::named_t::invalid_value) {
62-
throw e;
63-
}
61+
if (e.code() != cuda::status::named_t::invalid_value) { throw; }
6462
}
6563
auto ptr_reported_as_managed = cuda::memory::pointer::detail_::get_attribute<CU_POINTER_ATTRIBUTE_IS_MANAGED>(raw_pointers[i]);
6664
assert_(ptr_reported_as_managed == 0);
@@ -140,7 +138,7 @@ void wrapped_pointers_and_regions(const cuda::device_t& device)
140138
<< ptr.get() << ", " << " host-side pointer: " << host_side_ptr;
141139
}
142140
catch(cuda::runtime_error& e) {
143-
if (e.code() != cuda::status::invalid_value) { throw e; }
141+
if (e.code() != cuda::status::invalid_value) { throw; }
144142
}
145143
}
146144

examples/common.hpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ bool your_type_was_() { return true; }
4040

4141
inline const char* ordinal_suffix(int n)
4242
{
43-
static const char suffixes [4][5] = {"th", "st", "nd", "rd"};
43+
static constexpr char suffixes [4][5] = {"th", "st", "nd", "rd"};
4444
auto ord = n % 100;
4545
if (ord / 10 == 1) { ord = 0; }
4646
ord = ord % 10;
@@ -95,12 +95,12 @@ std::ostream& operator<<(std::ostream& os, cuda::device::compute_capability_t cc
9595

9696
std::ostream& operator<<(std::ostream& os, cuda::multiprocessor_cache_preference_t pref)
9797
{
98-
return (os << cache_preference_name(pref));
98+
return os << cache_preference_name(pref);
9999
}
100100

101101
std::ostream& operator<<(std::ostream& os, cuda::context::host_thread_sync_scheduling_policy_t pref)
102102
{
103-
return (os << host_thread_sync_scheduling_policy_name(pref));
103+
return os << host_thread_sync_scheduling_policy_name(pref);
104104
}
105105

106106
std::ostream& operator<<(std::ostream& os, cuda::context::handle_t handle)
@@ -195,7 +195,7 @@ void print_context_stack()
195195
}
196196
std::cout << '\n';
197197
}
198-
for (auto it = contexts.rbegin(); it != contexts.rend(); it++) {
198+
for (auto it = contexts.rbegin(); it != contexts.rend(); ++it) {
199199
cuda::context::current::detail_::push(*it);
200200
}
201201
}
@@ -359,14 +359,33 @@ cuda::device::id_t choose_device(int argc, char ** argv)
359359
}
360360

361361
#ifdef __GNUC__
362+
363+
inline char const* describe_demangling_status(int status)
364+
{
365+
switch (status) {
366+
case 0: return "success";
367+
case 1: return "A memory allocation failure occurred";
368+
case 2: return "mangled_name is not a valid name under the C++ ABI mangling rules";
369+
case 3: return "One of the arguments is invalid";
370+
default: return "Unknown demangling status";
371+
}
372+
}
373+
362374
// Inefficient, but simple
363375
inline std::string demangle(const char *mangled_name)
364376
{
365377
if (mangled_name == nullptr) { return nullptr; }
366378
int status;
367-
char *raw_demangled = abi::__cxa_demangle(mangled_name, 0 /* output buffer */, 0 /* length */, &status);
379+
auto no_preallocated_output_buffer = nullptr;
380+
auto dont_return_mangled_length = nullptr;
381+
char *raw_demangled = abi::__cxa_demangle(
382+
mangled_name,
383+
no_preallocated_output_buffer,
384+
dont_return_mangled_length,
385+
&status);
368386
if (raw_demangled == nullptr) {
369-
throw std::runtime_error(std::string("Failed demangling \"") + mangled_name + '\"');
387+
throw std::runtime_error(std::string("Failed demangling \"") + mangled_name + "\": "
388+
+ describe_demangling_status(status));
370389
}
371390
std::string result { raw_demangled };
372391
free(raw_demangled);

examples/modified_cuda_samples/memMapIPCDrv/helper_multiprocess.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ int ipcOpenSocket(ipcHandle *&handle) {
217217
return 0;
218218
}
219219

220-
int ipcCloseSocket(ipcHandle *handle) {
220+
int ipcCloseSocket(const ipcHandle *handle) {
221221
if (!handle) {
222222
return -1;
223223
}
@@ -498,7 +498,7 @@ int ipcRecvShareableHandles(ipcHandle *handle,
498498
return 0;
499499
}
500500

501-
int ipcCloseSocket(ipcHandle *handle) {
501+
int ipcCloseSocket(const ipcHandle *handle) {
502502
for (int i = 0; i < handle->hMailslot.size(); i++) {
503503
CloseHandle(handle->hMailslot[i]);
504504
}

examples/modified_cuda_samples/memMapIPCDrv/helper_multiprocess.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ int
106106
ipcOpenSocket(ipcHandle *&handle);
107107

108108
int
109-
ipcCloseSocket(ipcHandle *handle);
109+
ipcCloseSocket(const ipcHandle *handle);
110110

111111
int
112112
ipcRecvShareableHandles(ipcHandle *handle, std::vector<shared_allocation_handle_t>& shareableHandles);

examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ namespace cg = cooperative_groups;
4444
#define THREADS_PER_BLOCK 512
4545
#define GRAPH_LAUNCH_ITERATIONS 3
4646

47-
__global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, size_t outputSize)
47+
__global__ void reduce(const float *inputVec, double *outputVec, size_t inputSize, size_t outputSize)
4848
{
4949
__shared__ double tmp[THREADS_PER_BLOCK];
5050

@@ -53,7 +53,7 @@ __global__ void reduce(float *inputVec, double *outputVec, size_t inputSize, siz
5353

5454
double temp_sum = 0.0;
5555
for (int i = globaltid; i < inputSize; i += gridDim.x * blockDim.x) {
56-
temp_sum += (double) inputVec[i];
56+
temp_sum += static_cast<double>(inputVec[i]);
5757
}
5858
tmp[cta.thread_rank()] = temp_sum;
5959

@@ -132,7 +132,7 @@ __global__ void reduceFinal(double *inputVec, double *result, size_t inputSize)
132132
}
133133

134134
void init_input(cuda::span<float> a) {
135-
auto generator = []() { return static_cast<float>(rand() & 0xFF) / (float)RAND_MAX; };
135+
auto generator = []() { return static_cast<float>(rand() & 0xFF) / static_cast<float>(RAND_MAX); };
136136
::std::generate_n(a.data(), a.size(), generator);
137137
}
138138

@@ -144,7 +144,7 @@ void myRealHostNodeCallback(char const *graph_construction_mode, double result)
144144

145145
void CUDART_CB myHostNodeCallback(void *type_erased_data)
146146
{
147-
auto *data = reinterpret_cast<std::pair<const char*, double*>*>(type_erased_data);
147+
auto *data = static_cast<std::pair<const char*, double*>*>(type_erased_data);
148148
auto graph_construction_mode = data->first;
149149
auto result = data->second;
150150
myRealHostNodeCallback(graph_construction_mode, *result);

examples/modified_cuda_samples/streamOrderedAllocationIPC/helper_multiprocess.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ int ipcOpenSocket(ipcHandle *&handle) {
217217
return 0;
218218
}
219219

220-
int ipcCloseSocket(ipcHandle *handle) {
220+
int ipcCloseSocket(const ipcHandle *handle) {
221221
if (!handle) {
222222
return -1;
223223
}
@@ -466,7 +466,7 @@ int ipcRecvShareableHandles(ipcHandle *handle,
466466
return 0;
467467
}
468468

469-
int ipcCloseSocket(ipcHandle *handle) {
469+
int ipcCloseSocket(const ipcHandle *handle) {
470470
for (int i = 0; i < handle->hMailslot.size(); i++) {
471471
CloseHandle(handle->hMailslot[i]);
472472
}

examples/modified_cuda_samples/streamOrderedAllocationIPC/helper_multiprocess.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ int
103103
ipcOpenSocket(ipcHandle *&handle);
104104

105105
int
106-
ipcCloseSocket(ipcHandle *handle);
106+
ipcCloseSocket(const ipcHandle *handle);
107107

108108
int
109109
ipcRecvShareableHandles(ipcHandle *handle, std::vector<shared_pool_handle_t>& shareable_pool_handles);

0 commit comments

Comments
 (0)