Merge pull request #4 from bogdanadnan/dev

bogdanadnan · web-flow · commit 1a82efe72be9 · 2019-06-07T14:55:43.000+03:00
OpenCL optimizations.
diff --git a/hash/gpu/opencl/opencl_hasher.cpp b/hash/gpu/opencl/opencl_hasher.cpp
@@ -205,8 +205,9 @@ bool opencl_hasher::__setup_device_info(opencl_device_info *device, double inten
     }
 
     device->profile_info.threads = (uint32_t)(max_threads * intensity / 100.0);
+    device->profile_info.threads = (device->profile_info.threads / 4) * 4; // make it divisible by 4
     if(max_threads > 0 && device->profile_info.threads == 0 && intensity > 0)
-        device->profile_info.threads = 1;
+        device->profile_info.threads = 4;
 
     double counter = (double)device->profile_info.threads / (double)device->profile_info.threads_per_chunk;
     size_t allocated_mem_for_current_chunk = 0;
@@ -651,8 +652,8 @@ bool opencl_kernel_prehasher(void *memory, int threads, argon2profile *profile,
 
     cl_int error;
 
-    size_t total_work_items = threads * 8 * profile->thr_cost;
-    size_t local_work_items = 8 * profile->thr_cost;
+    size_t total_work_items = 64 * threads / 4;
+    size_t local_work_items = 64;
 
     device->device_lock.lock();
 
@@ -666,7 +667,7 @@ bool opencl_kernel_prehasher(void *memory, int threads, argon2profile *profile,
 
     clSetKernelArg(device->kernel_prehash, 0, sizeof(device->arguments.preseed_memory[gpumgmt_thread->thread_id]), &device->arguments.preseed_memory[gpumgmt_thread->thread_id]);
     clSetKernelArg(device->kernel_prehash, 1, sizeof(device->arguments.seed_memory[gpumgmt_thread->thread_id]), &device->arguments.seed_memory[gpumgmt_thread->thread_id]);
-    clSetKernelArg(device->kernel_prehash, 2, 4 * sizeof(cl_ulong) * 60, NULL);
+    clSetKernelArg(device->kernel_prehash, 2, 16 * sizeof(cl_ulong) * 60, NULL);
 
     error=clEnqueueNDRangeKernel(device->queue, device->kernel_prehash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
     if(error != CL_SUCCESS) {
diff --git a/hash/gpu/opencl/opencl_kernel.cpp b/hash/gpu/opencl/opencl_kernel.cpp
@@ -799,15 +799,19 @@ __kernel void prehash (
         __global uint *seed,
         __local ulong *blake_shared) {
 
-	int hash = get_group_id(0);
-	int id = get_local_id(0);
+    int hash = get_group_id(0) * 4;
+    int id = get_local_id(0); // 64 threads
+
+    int hash_idx = id >> 4;
+    hash += hash_idx;
+    id = id & 0xF;
 
     int thr_id = id % 4; // thread id in session
     int session = id / 4; // 4 blake2b hashing session
     int lane = session / 2;  // 2 lanes
     int idx = session % 2; // idx in lane
 
-    __local uint *local_mem = (__local uint *)&blake_shared[session * BLAKE_SHARED_MEM_ULONG];
+    __local uint *local_mem = (__local uint *)&blake_shared[(hash_idx * 4 + session) * BLAKE_SHARED_MEM_ULONG];
     __global uint *local_preseed = preseed + hash * IXIAN_SEED_SIZE_UINT;
     __global uint *local_seed = seed + (hash * 4 + session) * BLOCK_SIZE_UINT;