Skip to content

Commit d301649

Browse files
committed
fix #4063 threading perf issues on Windows
1 parent 6414548 commit d301649

File tree

1 file changed

+40
-59
lines changed

1 file changed

+40
-59
lines changed

driver/others/blas_server_win32.c

Lines changed: 40 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -52,9 +52,7 @@
5252

5353
/* Thread server common information */
5454
typedef struct{
55-
CRITICAL_SECTION lock;
56-
HANDLE filled;
57-
HANDLE killed;
55+
HANDLE taskSemaphore;
5856

5957
blas_queue_t *queue; /* Parameter Pointer */
6058
int shutdown; /* server shutdown flag */
@@ -68,6 +66,7 @@ int blas_server_avail = 0;
6866
static BLASULONG server_lock = 0;
6967

7068
static blas_pool_t pool;
69+
static BLASULONG pool_lock = 0;
7170
static HANDLE blas_threads [MAX_CPU_NUMBER];
7271
static DWORD blas_threads_id[MAX_CPU_NUMBER];
7372

@@ -198,7 +197,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
198197

199198
/* This is the main routine of the threads. Each thread waits until a job is */
200199
/* queued. */
201-
202200
static DWORD WINAPI blas_thread_server(void *arg){
203201

204202
/* Thread identifier */
@@ -207,9 +205,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
207205
#endif
208206

209207
void *buffer, *sa, *sb;
210-
blas_queue_t *queue;
211-
DWORD action;
212-
HANDLE handles[] = {pool.filled, pool.killed};
208+
volatile blas_queue_t *queue;
213209

214210
/* Each server needs each buffer */
215211
buffer = blas_memory_alloc(2);
@@ -226,28 +222,32 @@ static DWORD WINAPI blas_thread_server(void *arg){
226222
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
227223
#endif
228224

229-
do {
230-
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
231-
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
232-
233-
if (action == WAIT_OBJECT_0 + 1) break;
225+
// all worker threads wait on the semaphore
226+
WaitForSingleObject(pool.taskSemaphore, INFINITE);
234227

228+
// kill the thread if we are shutting down the server
229+
if (pool.shutdown)
230+
break;
231+
235232
#ifdef SMP_DEBUG
236233
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
237234
#endif
238235

239-
EnterCriticalSection(&pool.lock);
236+
// grab a queued task and update the list
237+
volatile blas_queue_t* queue_next;
238+
LONG64 prev_value;
239+
do {
240+
queue = (volatile blas_queue_t*)pool.queue;
241+
if (!queue)
242+
break;
240243

241-
queue = pool.queue;
242-
if (queue) pool.queue = queue->next;
243-
244-
LeaveCriticalSection(&pool.lock);
244+
queue_next = (volatile blas_queue_t*)queue->next;
245+
prev_value = InterlockedCompareExchange64((PLONG64)&pool.queue, (LONG64)queue_next, (LONG64)queue);
246+
} while (prev_value != queue);
245247

246248
if (queue) {
247249
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
248250

249-
if (pool.queue) SetEvent(pool.filled);
250-
251251
sa = queue -> sa;
252252
sb = queue -> sb;
253253

@@ -332,13 +332,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
332332
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
333333
#endif
334334

335-
EnterCriticalSection(&queue->lock);
336-
337-
queue -> status = BLAS_STATUS_FINISHED;
338-
339-
LeaveCriticalSection(&queue->lock);
340-
341-
SetEvent(queue->finish);
335+
// mark our sub-task as complete
336+
InterlockedDecrement(&queue->status);
342337
}
343338

344339
/* Shutdown procedure */
@@ -353,7 +348,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
353348
}
354349

355350
/* Initializing routine */
356-
int blas_thread_init(void){
351+
int blas_thread_init(void){
357352
BLASLONG i;
358353

359354
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
@@ -367,9 +362,7 @@ int blas_thread_init(void){
367362

368363
if (!blas_server_avail){
369364

370-
InitializeCriticalSection(&pool.lock);
371-
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
372-
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
365+
pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
373366

374367
pool.shutdown = 0;
375368
pool.queue = NULL;
@@ -391,11 +384,10 @@ int blas_thread_init(void){
391384
/*
392385
User can call one of two routines.
393386
394-
exec_blas_async ... immediately returns after jobs are queued.
387+
exec_blas_async ... immediately returns after jobs are queued.
395388
396-
exec_blas ... returns after jobs are finished.
389+
exec_blas ... returns after jobs are finished.
397390
*/
398-
399391
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
400392

401393
#if defined(SMP_SERVER)
@@ -409,8 +401,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
409401
current = queue;
410402

411403
while (current) {
412-
InitializeCriticalSection(&current -> lock);
413-
current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
404+
current->status = 1;
414405
current -> position = pos;
415406

416407
#ifdef CONSISTENT_FPCSR
@@ -422,19 +413,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
422413
pos ++;
423414
}
424415

425-
EnterCriticalSection(&pool.lock);
416+
pool.queue = queue;
426417

427-
if (pool.queue) {
428-
current = pool.queue;
429-
while (current -> next) current = current -> next;
430-
current -> next = queue;
431-
} else {
432-
pool.queue = queue;
433-
}
434-
435-
LeaveCriticalSection(&pool.lock);
436-
437-
SetEvent(pool.filled);
418+
// start up worker threads
419+
ReleaseSemaphore(pool.taskSemaphore, pos - 1, NULL);
438420

439421
return 0;
440422
}
@@ -450,10 +432,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
450432
fprintf(STDERR, "Waiting Queue ..\n");
451433
#endif
452434

453-
WaitForSingleObject(queue->finish, INFINITE);
454-
455-
CloseHandle(queue->finish);
456-
DeleteCriticalSection(&queue -> lock);
435+
// spin-wait on each sub-task to finish
436+
while (*((volatile int*)&queue->status))
437+
YIELDING;
457438

458439
queue = queue -> next;
459440
num --;
@@ -501,18 +482,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
501482

502483
/* Shutdown procedure, but the user doesn't have to call this routine. The */
503484
/* kernel automatically kills the threads. */
504-
505485
int BLASFUNC(blas_thread_shutdown)(void){
506486

507487
int i;
508488

489+
#ifdef SMP_DEBUG
490+
fprintf(STDERR, "blas_thread_shutdown..\n");
491+
#endif
492+
509493
if (!blas_server_avail) return 0;
510494

511495
LOCK_COMMAND(&server_lock);
512496

513497
if (blas_server_avail){
514498

515-
SetEvent(pool.killed);
499+
pool.shutdown = 1;
516500

517501
for(i = 0; i < blas_num_threads - 1; i++){
518502
// Could also just use WaitForMultipleObjects
@@ -528,8 +512,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
528512
CloseHandle(blas_threads[i]);
529513
}
530514

531-
CloseHandle(pool.filled);
532-
CloseHandle(pool.killed);
515+
CloseHandle(pool.taskSemaphore);
533516

534517
blas_server_avail = 0;
535518
}
@@ -559,16 +542,14 @@ void goto_set_num_threads(int num_threads)
559542
//increased_threads = 1;
560543
if (!blas_server_avail){
561544

562-
InitializeCriticalSection(&pool.lock);
563-
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
564-
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
545+
pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
565546

566547
pool.shutdown = 0;
567548
pool.queue = NULL;
568549
blas_server_avail = 1;
569550
}
570551

571-
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
552+
for(i = blas_num_threads; i < num_threads - 1; i++){
572553

573554
blas_threads[i] = CreateThread(NULL, 0,
574555
blas_thread_server, (void *)i,

0 commit comments

Comments
 (0)