Skip to content

Commit 5440805

Browse files
authored
Merge pull request #3 from bogdanadnan/dev
Dev
2 parents 550357c + 1585cc1 commit 5440805

File tree

9 files changed

+656
-66
lines changed

9 files changed

+656
-66
lines changed

hash/gpu/cuda/cuda_hasher.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,9 @@ bool cuda_hasher::__setup_device_info(cuda_device_info *device, double intensity
243243
}
244244

245245
device->profile_info.threads = (uint32_t)(max_threads * intensity / 100.0);
246-
if(max_threads > 0 && device->profile_info.threads == 0 && intensity > 0)
247-
device->profile_info.threads = 1;
246+
device->profile_info.threads = (device->profile_info.threads / 2) * 2; // make it divisible by 2
247+
if(max_threads > 0 && device->profile_info.threads == 0 && intensity > 0)
248+
device->profile_info.threads = 2;
248249

249250
chunks = (double)device->profile_info.threads / (double)device->profile_info.threads_per_chunk;
250251

hash/gpu/cuda/cuda_kernel.cu

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -553,17 +553,20 @@ __global__ void fill_blocks(uint32_t *scratchpad0,
553553
__global__ void prehash (
554554
uint32_t *preseed,
555555
uint32_t *seed) {
556-
extern __shared__ uint32_t shared[]; // size = 4 * 88
556+
extern __shared__ uint32_t shared[]; // size = 8 * 88
557557

558-
int hash = blockIdx.x;
559-
int id = threadIdx.x; // 16 threads
558+
int hash = blockIdx.x * 2;
559+
int id = threadIdx.x; // 32 threads
560+
int hash_idx = id >> 4;
561+
hash += hash_idx;
562+
id = id & 0xF;
560563

561564
int thr_id = id % 4; // thread id in session
562565
int session = id / 4; // 4 blake2b hashing session
563566
int lane = session / 2; // 2 lanes
564567
int idx = session % 2; // idx in lane
565568

566-
uint32_t *local_mem = &shared[session * BLAKE_SHARED_MEM_UINT];
569+
uint32_t *local_mem = &shared[(hash_idx * 4 + session) * BLAKE_SHARED_MEM_UINT];
567570
uint32_t *local_preseed = preseed + hash * IXIAN_SEED_SIZE_UINT;
568571
uint32_t *local_seed = seed + (hash * 4 + session) * BLOCK_SIZE_UINT;
569572

@@ -891,7 +894,7 @@ bool cuda_kernel_prehasher(void *memory, int threads, argon2profile *profile, vo
891894
cuda_device_info *device = gpumgmt_thread->device;
892895
cudaStream_t stream = (cudaStream_t)gpumgmt_thread->device_data;
893896

894-
size_t work_items = 8 * profile->thr_cost;
897+
size_t work_items = 32;
895898

896899
gpumgmt_thread->lock();
897900

@@ -902,7 +905,7 @@ bool cuda_kernel_prehasher(void *memory, int threads, argon2profile *profile, vo
902905
return false;
903906
}
904907

905-
prehash <<<threads, work_items, 4 * BLAKE_SHARED_MEM, stream>>> (
908+
prehash <<<threads / 2, work_items, 8 * BLAKE_SHARED_MEM, stream>>> (
906909
device->arguments.preseed_memory[gpumgmt_thread->thread_id],
907910
device->arguments.seed_memory[gpumgmt_thread->thread_id]);
908911

hash/gpu/opencl/opencl_hasher.cpp

Lines changed: 140 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,22 @@ bool opencl_hasher::__setup_device_info(opencl_device_info *device, double inten
165165
return false;
166166
}
167167

168-
device->kernel = clCreateKernel(device->program, "fill_blocks", &error);
168+
device->kernel_prehash = clCreateKernel(device->program, "prehash", &error);
169169
if(error != CL_SUCCESS) {
170170
device->error = error;
171-
device->error_message = "Error creating opencl kernel for device.";
171+
device->error_message = "Error creating opencl prehash kernel for device.";
172+
return false;
173+
}
174+
device->kernel_fill_blocks = clCreateKernel(device->program, "fill_blocks", &error);
175+
if(error != CL_SUCCESS) {
176+
device->error = error;
177+
device->error_message = "Error creating opencl main kernel for device.";
178+
return false;
179+
}
180+
device->kernel_posthash = clCreateKernel(device->program, "posthash", &error);
181+
if(error != CL_SUCCESS) {
182+
device->error = error;
183+
device->error_message = "Error creating opencl posthash kernel for device.";
172184
return false;
173185
}
174186

@@ -321,28 +333,56 @@ bool opencl_hasher::__setup_device_info(opencl_device_info *device, double inten
321333
return false;
322334
}
323335

324-
device->arguments.seed_memory[0] = clCreateBuffer(device->context, CL_MEM_READ_ONLY, max_threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
336+
device->arguments.preseed_memory[0] = clCreateBuffer(device->context, CL_MEM_READ_ONLY, device->profile_info.threads * IXIAN_SEED_SIZE, NULL, &error);
337+
if(error != CL_SUCCESS) {
338+
device->error = error;
339+
device->error_message = "Error creating memory buffer.";
340+
return false;
341+
}
342+
343+
device->arguments.preseed_memory[1] = clCreateBuffer(device->context, CL_MEM_READ_ONLY, device->profile_info.threads * IXIAN_SEED_SIZE, NULL, &error);
344+
if(error != CL_SUCCESS) {
345+
device->error = error;
346+
device->error_message = "Error creating memory buffer.";
347+
return false;
348+
}
349+
350+
device->arguments.seed_memory[0] = clCreateBuffer(device->context, CL_MEM_READ_WRITE, device->profile_info.threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
351+
if(error != CL_SUCCESS) {
352+
device->error = error;
353+
device->error_message = "Error creating memory buffer.";
354+
return false;
355+
}
356+
357+
device->arguments.seed_memory[1] = clCreateBuffer(device->context, CL_MEM_READ_WRITE, device->profile_info.threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
358+
if(error != CL_SUCCESS) {
359+
device->error = error;
360+
device->error_message = "Error creating memory buffer.";
361+
return false;
362+
}
363+
364+
device->arguments.out_memory[0] = clCreateBuffer(device->context, CL_MEM_READ_WRITE, device->profile_info.threads * ARGON2_BLOCK_SIZE, NULL, &error);
325365
if(error != CL_SUCCESS) {
326366
device->error = error;
327367
device->error_message = "Error creating memory buffer.";
328368
return false;
329369
}
330370

331-
device->arguments.seed_memory[1] = clCreateBuffer(device->context, CL_MEM_READ_ONLY, max_threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
371+
device->arguments.out_memory[1] = clCreateBuffer(device->context, CL_MEM_READ_WRITE, device->profile_info.threads * ARGON2_BLOCK_SIZE, NULL, &error);
332372
if(error != CL_SUCCESS) {
333373
device->error = error;
334374
device->error_message = "Error creating memory buffer.";
335375
return false;
336376
}
337377

338-
device->arguments.out_memory[0] = clCreateBuffer(device->context, CL_MEM_WRITE_ONLY, max_threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
378+
device->arguments.hash_memory[0] = clCreateBuffer(device->context, CL_MEM_WRITE_ONLY, device->profile_info.threads * ARGON2_RAW_LENGTH, NULL, &error);
339379
if(error != CL_SUCCESS) {
340380
device->error = error;
341381
device->error_message = "Error creating memory buffer.";
342382
return false;
343383
}
344384

345-
device->arguments.out_memory[1] = clCreateBuffer(device->context, CL_MEM_WRITE_ONLY, max_threads * 4 * ARGON2_BLOCK_SIZE, NULL, &error);
385+
device->arguments.hash_memory[1] = clCreateBuffer(device->context, CL_MEM_WRITE_ONLY, device->profile_info.threads * ARGON2_RAW_LENGTH, NULL, &error);
346386
if(error != CL_SUCCESS) {
347387
device->error = error;
348388
device->error_message = "Error creating memory buffer.";
@@ -381,15 +421,15 @@ bool opencl_hasher::__setup_device_info(opencl_device_info *device, double inten
381421
}
382422
free(segments);
383423

384-
clSetKernelArg(device->kernel, 0, sizeof(device->arguments.memory_chunk_0), &device->arguments.memory_chunk_0);
385-
clSetKernelArg(device->kernel, 1, sizeof(device->arguments.memory_chunk_1), &device->arguments.memory_chunk_1);
386-
clSetKernelArg(device->kernel, 2, sizeof(device->arguments.memory_chunk_2), &device->arguments.memory_chunk_2);
387-
clSetKernelArg(device->kernel, 3, sizeof(device->arguments.memory_chunk_3), &device->arguments.memory_chunk_3);
388-
clSetKernelArg(device->kernel, 4, sizeof(device->arguments.memory_chunk_4), &device->arguments.memory_chunk_4);
389-
clSetKernelArg(device->kernel, 5, sizeof(device->arguments.memory_chunk_5), &device->arguments.memory_chunk_5);
390-
clSetKernelArg(device->kernel, 8, sizeof(device->arguments.address), &device->arguments.address);
391-
clSetKernelArg(device->kernel, 9, sizeof(device->arguments.segments), &device->arguments.segments);
392-
clSetKernelArg(device->kernel, 10, sizeof(int32_t), &device->profile_info.threads_per_chunk);
424+
clSetKernelArg(device->kernel_fill_blocks, 0, sizeof(device->arguments.memory_chunk_0), &device->arguments.memory_chunk_0);
425+
clSetKernelArg(device->kernel_fill_blocks, 1, sizeof(device->arguments.memory_chunk_1), &device->arguments.memory_chunk_1);
426+
clSetKernelArg(device->kernel_fill_blocks, 2, sizeof(device->arguments.memory_chunk_2), &device->arguments.memory_chunk_2);
427+
clSetKernelArg(device->kernel_fill_blocks, 3, sizeof(device->arguments.memory_chunk_3), &device->arguments.memory_chunk_3);
428+
clSetKernelArg(device->kernel_fill_blocks, 4, sizeof(device->arguments.memory_chunk_4), &device->arguments.memory_chunk_4);
429+
clSetKernelArg(device->kernel_fill_blocks, 5, sizeof(device->arguments.memory_chunk_5), &device->arguments.memory_chunk_5);
430+
clSetKernelArg(device->kernel_fill_blocks, 8, sizeof(device->arguments.address), &device->arguments.address);
431+
clSetKernelArg(device->kernel_fill_blocks, 9, sizeof(device->arguments.segments), &device->arguments.segments);
432+
clSetKernelArg(device->kernel_fill_blocks, 10, sizeof(int32_t), &device->profile_info.threads_per_chunk);
393433

394434
return true;
395435
}
@@ -605,68 +645,112 @@ struct opencl_gpumgmt_thread_data {
605645
opencl_device_info *device;
606646
};
607647

648+
bool opencl_kernel_prehasher(void *memory, int threads, argon2profile *profile, void *user_data) {
649+
opencl_gpumgmt_thread_data *gpumgmt_thread = (opencl_gpumgmt_thread_data *)user_data;
650+
opencl_device_info *device = gpumgmt_thread->device;
651+
652+
cl_int error;
653+
654+
size_t total_work_items = threads * 8 * profile->thr_cost;
655+
size_t local_work_items = 8 * profile->thr_cost;
656+
657+
device->device_lock.lock();
658+
659+
error = clEnqueueWriteBuffer(device->queue, device->arguments.preseed_memory[gpumgmt_thread->thread_id], CL_FALSE, 0, threads * IXIAN_SEED_SIZE, memory, 0, NULL, NULL);
660+
if (error != CL_SUCCESS) {
661+
device->error = error;
662+
device->error_message = "Error writing to gpu memory.";
663+
device->device_lock.unlock();
664+
return false;
665+
}
666+
667+
clSetKernelArg(device->kernel_prehash, 0, sizeof(device->arguments.preseed_memory[gpumgmt_thread->thread_id]), &device->arguments.preseed_memory[gpumgmt_thread->thread_id]);
668+
clSetKernelArg(device->kernel_prehash, 1, sizeof(device->arguments.seed_memory[gpumgmt_thread->thread_id]), &device->arguments.seed_memory[gpumgmt_thread->thread_id]);
669+
clSetKernelArg(device->kernel_prehash, 2, 4 * sizeof(cl_ulong) * 60, NULL);
670+
671+
error=clEnqueueNDRangeKernel(device->queue, device->kernel_prehash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
672+
if(error != CL_SUCCESS) {
673+
device->error = error;
674+
device->error_message = "Error running the kernel.";
675+
device->device_lock.unlock();
676+
return false;
677+
}
678+
679+
return true;
680+
}
681+
608682
void *opencl_kernel_filler(void *memory, int threads, argon2profile *profile, void *user_data) {
609-
// uint64_t start_log = microseconds();
610-
// printf("Waiting for lock: %lld\n", microseconds() - start_log);
611-
// start_log = microseconds();
612683
opencl_gpumgmt_thread_data *gpumgmt_thread = (opencl_gpumgmt_thread_data *)user_data;
613684
opencl_device_info *device = gpumgmt_thread->device;
614685

615686
cl_int error;
616687

617-
int mem_seed_count = profile->thr_cost;
618688
size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thr_cost;
619689
size_t local_work_items = KERNEL_WORKGROUP_SIZE * profile->thr_cost;
620690

621-
device->device_lock.lock();
691+
clSetKernelArg(device->kernel_fill_blocks, 6, sizeof(device->arguments.seed_memory[gpumgmt_thread->thread_id]), &device->arguments.seed_memory[gpumgmt_thread->thread_id]);
692+
clSetKernelArg(device->kernel_fill_blocks, 7, sizeof(device->arguments.out_memory[gpumgmt_thread->thread_id]), &device->arguments.out_memory[gpumgmt_thread->thread_id]);
693+
error=clEnqueueNDRangeKernel(device->queue, device->kernel_fill_blocks, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
694+
if(error != CL_SUCCESS) {
695+
device->error = error;
696+
device->error_message = "Error running the kernel.";
697+
device->device_lock.unlock();
698+
return NULL;
699+
}
622700

623-
error = clEnqueueWriteBuffer(device->queue, device->arguments.seed_memory[gpumgmt_thread->thread_id], CL_FALSE, 0, threads * 2 * mem_seed_count * ARGON2_BLOCK_SIZE, memory, 0, NULL, NULL);
624-
if (error != CL_SUCCESS) {
625-
device->error = error;
626-
device->error_message = "Error writing to gpu memory.";
627-
device->device_lock.unlock();
628-
return NULL;
629-
}
701+
return memory;
702+
}
703+
704+
bool opencl_kernel_posthasher(void *memory, int threads, argon2profile *profile, void *user_data) {
705+
opencl_gpumgmt_thread_data *gpumgmt_thread = (opencl_gpumgmt_thread_data *)user_data;
706+
opencl_device_info *device = gpumgmt_thread->device;
707+
708+
cl_int error;
709+
710+
size_t total_work_items = threads * 4;
711+
size_t local_work_items = 4;
630712

631-
clSetKernelArg(device->kernel, 6, sizeof(device->arguments.seed_memory[gpumgmt_thread->thread_id]), &device->arguments.seed_memory[gpumgmt_thread->thread_id]);
632-
clSetKernelArg(device->kernel, 7, sizeof(device->arguments.out_memory[gpumgmt_thread->thread_id]), &device->arguments.out_memory[gpumgmt_thread->thread_id]);
633-
error=clEnqueueNDRangeKernel(device->queue, device->kernel, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
713+
clSetKernelArg(device->kernel_posthash, 0, sizeof(device->arguments.hash_memory[gpumgmt_thread->thread_id]), &device->arguments.hash_memory[gpumgmt_thread->thread_id]);
714+
clSetKernelArg(device->kernel_posthash, 1, sizeof(device->arguments.out_memory[gpumgmt_thread->thread_id]), &device->arguments.out_memory[gpumgmt_thread->thread_id]);
715+
clSetKernelArg(device->kernel_posthash, 2, sizeof(cl_ulong) * 60, NULL);
716+
717+
error=clEnqueueNDRangeKernel(device->queue, device->kernel_posthash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
634718
if(error != CL_SUCCESS) {
635719
device->error = error;
636720
device->error_message = "Error running the kernel.";
637721
device->device_lock.unlock();
638-
return NULL;
722+
return false;
639723
}
640724

641-
error = clEnqueueReadBuffer(device->queue, device->arguments.out_memory[gpumgmt_thread->thread_id], CL_FALSE, 0, threads * 2 * mem_seed_count * ARGON2_BLOCK_SIZE, memory, 0, NULL, NULL);
642-
if (error != CL_SUCCESS) {
643-
device->error = error;
644-
device->error_message = "Error reading gpu memory.";
645-
device->device_lock.unlock();
646-
return NULL;
647-
}
648-
649-
error=clFinish(device->queue);
725+
error = clEnqueueReadBuffer(device->queue, device->arguments.hash_memory[gpumgmt_thread->thread_id], CL_FALSE, 0, threads * ARGON2_RAW_LENGTH, memory, 0, NULL, NULL);
726+
if (error != CL_SUCCESS) {
727+
device->error = error;
728+
device->error_message = "Error reading gpu memory.";
729+
device->device_lock.unlock();
730+
return false;
731+
}
732+
733+
error=clFinish(device->queue);
650734
if(error != CL_SUCCESS) {
651735
device->error = error;
652736
device->error_message = "Error flushing GPU queue.";
653737
device->device_lock.unlock();
654-
return NULL;
738+
return false;
655739
}
656740

657741
device->device_lock.unlock();
658742

659-
return memory;
743+
return true;
660744
}
661745

662746
void opencl_hasher::__run(opencl_device_info *device, int thread_id) {
663-
void *memory = malloc(4 * ARGON2_BLOCK_SIZE * device->profile_info.threads);
747+
void *memory = malloc(IXIAN_SEED_SIZE * device->profile_info.threads);
664748

665749
opencl_gpumgmt_thread_data thread_data;
666750
thread_data.device = device;
667751
thread_data.thread_id = thread_id;
668752

669-
argon2 hash_factory(NULL, opencl_kernel_filler, NULL, memory, &thread_data);
753+
argon2 hash_factory(opencl_kernel_prehasher, opencl_kernel_filler, opencl_kernel_posthasher, memory, &thread_data);
670754
hash_factory.set_lane_length(2);
671755

672756
while(__running) {
@@ -721,12 +805,18 @@ void opencl_hasher::cleanup() {
721805
clReleaseMemObject((*it)->arguments.memory_chunk_5);
722806
clReleaseMemObject((*it)->arguments.address);
723807
clReleaseMemObject((*it)->arguments.segments);
724-
clReleaseMemObject((*it)->arguments.seed_memory[0]);
725-
clReleaseMemObject((*it)->arguments.seed_memory[1]);
726-
clReleaseMemObject((*it)->arguments.out_memory[0]);
727-
clReleaseMemObject((*it)->arguments.out_memory[1]);
728-
729-
clReleaseKernel((*it)->kernel);
808+
clReleaseMemObject((*it)->arguments.preseed_memory[0]);
809+
clReleaseMemObject((*it)->arguments.preseed_memory[1]);
810+
clReleaseMemObject((*it)->arguments.seed_memory[0]);
811+
clReleaseMemObject((*it)->arguments.seed_memory[1]);
812+
clReleaseMemObject((*it)->arguments.out_memory[0]);
813+
clReleaseMemObject((*it)->arguments.out_memory[1]);
814+
clReleaseMemObject((*it)->arguments.hash_memory[0]);
815+
clReleaseMemObject((*it)->arguments.hash_memory[1]);
816+
817+
clReleaseKernel((*it)->kernel_prehash);
818+
clReleaseKernel((*it)->kernel_fill_blocks);
819+
clReleaseKernel((*it)->kernel_posthash);
730820
clReleaseProgram((*it)->program);
731821
clReleaseCommandQueue((*it)->queue);
732822
clReleaseContext((*it)->context);

hash/gpu/opencl/opencl_hasher.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ struct opencl_kernel_arguments {
2424
cl_mem memory_chunk_5;
2525
cl_mem address;
2626
cl_mem segments;
27+
cl_mem preseed_memory[2];
2728
cl_mem seed_memory[2];
2829
cl_mem out_memory[2];
30+
cl_mem hash_memory[2];
2931
};
3032

3133
struct argon2profile_info {
@@ -45,7 +47,9 @@ struct opencl_device_info {
4547
cl_command_queue queue;
4648

4749
cl_program program;
48-
cl_kernel kernel;
50+
cl_kernel kernel_prehash;
51+
cl_kernel kernel_fill_blocks;
52+
cl_kernel kernel_posthash;
4953

5054
int device_index;
5155

0 commit comments

Comments
 (0)