diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..145a7ea3 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -47,13 +47,15 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout << "Memory: DEFAULT" << std::endl; #endif + hipDeviceProp_t props; + hipGetDeviceProperties(&props, device_index); + check_error(); + array_size = ARRAY_SIZE; - // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) - dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); + dot_num_blocks = props.multiProcessorCount * 4; size_t array_bytes = sizeof(T); array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires @@ -63,9 +65,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) check_error(); // Check buffers fit on the device - hipDeviceProp_t props; - hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*array_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..b437d2b7 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,27 +14,10 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" -#define DOT_READ_DWORDS_PER_LANE 4 - template class HIPStream : public Stream { - // Make sure that either: - // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element - // or - // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DOT_READ_DWORDS_PER_LANE not divisible by sizeof(element_type)"); - - // Take into account the datatype size - // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements - // and 4 FP32 elements - static constexpr unsigned int dot_elements_per_lane{ - (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; - protected: // Size of arrays intptr_t array_size;