turboderp-org
diff --git a/exllamav3/exllamav3_ext/parallel/all_reduce.cu b/exllamav3/exllamav3_ext/parallel/all_reduce.cu
Lines changed: 14 additions & 13 deletions
diff --git a/exllamav3/exllamav3_ext/parallel/all_reduce.cuh b/exllamav3/exllamav3_ext/parallel/all_reduce.cuh
Lines changed: 4 additions & 2 deletions
diff --git a/exllamav3/exllamav3_ext/parallel/all_reduce_cpu.cu b/exllamav3/exllamav3_ext/parallel/all_reduce_cpu.cu
Lines changed: 9 additions & 11 deletions
diff --git a/exllamav3/exllamav3_ext/parallel/barrier.cu b/exllamav3/exllamav3_ext/parallel/barrier.cu
Lines changed: 7 additions & 5 deletions
diff --git a/exllamav3/exllamav3_ext/parallel/barrier.cuh b/exllamav3/exllamav3_ext/parallel/barrier.cuh
Lines changed: 2 additions & 1 deletion
diff --git a/exllamav3/exllamav3_ext/parallel/barrier_inner.cuh b/exllamav3/exllamav3_ext/parallel/barrier_inner.cuh
Lines changed: 6 additions & 7 deletions
@@ -25,16 +25,14 @@ void pg_all_reduce_kernel
     uint8_t* __restrict__ data_ptr,
     uint8_t* __restrict__ shbuf_ptr,
     size_t data_size,
-    size_t shbuf_size
+    size_t shbuf_size,
+    uint32_t* abort_flag
 )
 {
     int t = threadIdx.x;
     auto grid = cg::this_grid();
 
-    __shared__ bool timeout;
     __shared__ bool r;
-    timeout = false;
-
     int dir = blockIdx.x;
 
     int num_ranks = __popc(device_mask);
@@ -79,8 +77,8 @@ void pg_all_reduce_kernel
             {
                 __nanosleep(sleep);
                 if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
-                else timeout = check_timeout(ctx, deadline, "all_reduce");
-                if (timeout) break;
+                else *abort_flag = check_timeout(ctx, deadline, "all_reduce");
+                if (*abort_flag) break;
             }
         }
         __syncthreads();
@@ -168,8 +166,8 @@ void pg_all_reduce_kernel
                 {
                     __nanosleep(sleep);
                     if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
-                    else timeout = check_timeout(ctx, deadline, "all_reduce (1)");
-                    if (timeout) break;
+                    else *abort_flag = check_timeout(ctx, deadline, "all_reduce (1)");
+                    if (*abort_flag) break;
                 }
             }
         }
@@ -204,21 +202,21 @@ void pg_all_reduce_kernel
                 {
                     __nanosleep(sleep);
                     if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
-                    else timeout = check_timeout(ctx, deadline, "all_reduce (2)");
-                    if (timeout) break;
+                    else *abort_flag = check_timeout(ctx, deadline, "all_reduce (2)");
+                    if (*abort_flag) break;
                 }
             }
 
             // Wait for destination to finish receiving
             wait_min_stage(ctx->reduce_stage_consumed + dst_rank, stage_end, deadline);
         }
 
-        if (timeout) break;
+        if (*abort_flag) break;
         grid.sync();
     }
 
     // Finished. Reset counters for next kernel
-    pg_barrier_inner(ctx, device_mask, this_device, master_device);
+    pg_barrier_inner(ctx, device_mask, this_device, master_device, abort_flag);
 
     if (t == 0)
     {
@@ -237,7 +235,8 @@ void pg_all_reduce
     int master_device,
     at::Tensor& tensor,
     uintptr_t shbuf,
-    size_t shbuf_size
+    size_t shbuf_size,
+    at::Tensor& abort_flag
 )
 {
     const at::cuda::OptionalCUDAGuard device_guard(this_device);
@@ -256,6 +255,7 @@ void pg_all_reduce
     int threads = (int) CEIL_DIVIDE(CEIL_DIVIDE(data_size / 16ll, num_ranks), 32ll) * 32ll;
     threads = MIN(threads, MAX_NUM_THREADS);
 
+    uint32_t* abort_flag_ptr = (uint32_t*) abort_flag.data_ptr();
     void* kernelArgs[] =
     {
         (void*)& ctx,
@@ -266,6 +266,7 @@ void pg_all_reduce
         (void*)& shbuf_ptr,
         (void*)& data_size,
         (void*)& shbuf_size,
+        (void*)& abort_flag_ptr
     };
 
     dim3 block_grid(2);
 
@@ -10,7 +10,8 @@ void pg_all_reduce
     int master_device,
     at::Tensor& tensor,
     uintptr_t shbuf,
-    size_t shbuf_size
+    size_t shbuf_size,
+    at::Tensor& abort_flag
 );
 
 void pg_all_reduce_cpu
@@ -23,7 +24,8 @@ void pg_all_reduce_cpu
     bool contributor,
     uintptr_t shbuf,
     size_t shbuf_size,
-    bool is_master
+    bool is_master,
+    at::Tensor& abort_flag
 );
 
 void run_cpu_reduce_jobs
 
@@ -296,8 +296,6 @@ void perform_cpu_reduce
     }
 }
 
-__device__ bool pg_all_reduce_cpu_kernel_timeout;
-
 #define PARCK_MODE_FLOAT 0
 #define PARCK_MODE_HALF 1
 #define PARCK_MODE_BF16 2
@@ -314,7 +312,8 @@ void pg_all_reduce_cpu_kernel
     uint8_t* __restrict__ shbuf_ptr,
     size_t data_size,
     size_t shbuf_size,
-    bool contributor
+    bool contributor,
+    uint32_t* abort_flag
 )
 {
     // Indexing
@@ -335,8 +334,6 @@ void pg_all_reduce_cpu_kernel
 
     int t = threadIdx.x;
     int dir = blockIdx.x;
-    if (t == 0)
-        pg_all_reduce_cpu_kernel_timeout = false;
     auto grid = cg::this_grid();
 
     // Get device stage
@@ -453,14 +450,12 @@ void pg_all_reduce_cpu_kernel
                     if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
                     else if (check_timeout(ctx, deadline, "pg_all_reduce_cpu_kernel"))
                     {
-                        DBGI2(ep, stage);
-                        to = true;
+                        *abort_flag = 1;
                         break;
                     }
                 }
             }
             __syncthreads();
-            if (to) pg_all_reduce_cpu_kernel_timeout = true;
 
             // Recv float
             if constexpr (dtype == PARCK_MODE_FLOAT)
@@ -512,7 +507,7 @@ void pg_all_reduce_cpu_kernel
         }
 
         grid.sync();
-        if (pg_all_reduce_cpu_kernel_timeout) break;
+        if (*abort_flag) break;
     }
 }
 
@@ -526,7 +521,8 @@ void pg_all_reduce_cpu
     bool contributor,
     uintptr_t shbuf,
     size_t shbuf_size,
-    bool is_master
+    bool is_master,
+    at::Tensor& abort_flag
 )
 {
     const at::cuda::OptionalCUDAGuard device_guard(this_device);
@@ -543,6 +539,7 @@ void pg_all_reduce_cpu
 
     TORCH_CHECK(cpu_data_size % 16 == 0, "data_size must be multiple of 16");
 
+    uint32_t* abort_flag_ptr = (uint32_t*) abort_flag.data_ptr();
     void* kernelArgs[] =
     {
         (void*)& ctx,
@@ -553,7 +550,8 @@ void pg_all_reduce_cpu
         (void*)& shbuf_ptr,
         (void*)& device_data_size,
         (void*)& shbuf_size,
-        (void*)& contributor
+        (void*)& contributor,
+        (void*)& abort_flag_ptr
     };
 
     dim3 block_grid(2);
 
@@ -14,18 +14,19 @@ __global__ void pg_barrier_kernel
     PGContext* __restrict__ ctx,
     uint32_t device_mask,
     int this_device,
-    int coordinator_device
+    int coordinator_device,
+    uint32_t* abort_flag
 )
 {
-    pg_barrier_inner(ctx, device_mask, this_device, coordinator_device);
+    pg_barrier_inner(ctx, device_mask, this_device, coordinator_device, abort_flag);
 }
 
-
 void pg_barrier
 (
     uintptr_t ctx,
     std::vector<uintptr_t> devices,
-    int this_device
+    int this_device,
+    at::Tensor& abort_flag
 )
 {
     const at::cuda::OptionalCUDAGuard device_guard(this_device);
@@ -40,7 +41,8 @@ void pg_barrier
         (PGContext*) ctx,  // Shared, pinned
         device_mask,
         this_device,
-        devices[0]
+        devices[0],
+        (uint32_t*) abort_flag.data_ptr()
     );
     cuda_check(cudaPeekAtLastError());
 }
@@ -7,5 +7,6 @@ void pg_barrier
 (
     uintptr_t ctx,
     std::vector<uintptr_t> devices,
-    int this_device
+    int this_device,
+    at::Tensor& abort_flag
 );
@@ -4,11 +4,10 @@ __device__ __forceinline__ void pg_barrier_inner
     PGContext* __restrict__ ctx,
     uint32_t device_mask,
     int this_device,
-    int coordinator_device
+    int coordinator_device,
+    uint32_t* abort_flag
 )
 {
-    bool timeout = false;
-
     if (!blockIdx.x && !blockIdx.y && !blockIdx.z && !threadIdx.x && !threadIdx.y && !threadIdx.z)
     {
         uint32_t* epoch_ptr     = &ctx->barrier_epoch;
@@ -47,8 +46,8 @@ __device__ __forceinline__ void pg_barrier_inner
                 {
                     __nanosleep(sleep);
                     if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
-                    else timeout = check_timeout(ctx, deadline, "barrier");
-                    if (timeout) break;
+                    else *abort_flag = check_timeout(ctx, deadline, "barrier");
+                    if (*abort_flag) break;
                 }
                 else sleep = SYNC_MIN_SLEEP;
             }
@@ -66,8 +65,8 @@ __device__ __forceinline__ void pg_barrier_inner
             {
                 __nanosleep(sleep);
                 if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;
-                else timeout = check_timeout(ctx, deadline, "barrier");
-                if (timeout) break;
+                else *abort_flag = check_timeout(ctx, deadline, "barrier");
+                if (*abort_flag) break;
             }
         }
     }
Original file line number	Diff line number	Diff line change
`@@ -25,16 +25,14 @@ void pg_all_reduce_kernel`
`25`	`25`	`uint8_t* __restrict__ data_ptr,`
`26`	`26`	`uint8_t* __restrict__ shbuf_ptr,`
`27`	`27`	`size_t data_size,`
`28`		`- size_t shbuf_size`
	`28`	`+ size_t shbuf_size,`
	`29`	`+ uint32_t* abort_flag`
`29`	`30`	`)`
`30`	`31`	`{`
`31`	`32`	`int t = threadIdx.x;`
`32`	`33`	`auto grid = cg::this_grid();`
`33`	`34`
`34`		`- __shared__ bool timeout;`
`35`	`35`	`__shared__ bool r;`
`36`		`- timeout = false;`
`37`		`-`
`38`	`36`	`int dir = blockIdx.x;`
`39`	`37`
`40`	`38`	`int num_ranks = __popc(device_mask);`
`@@ -79,8 +77,8 @@ void pg_all_reduce_kernel`
`79`	`77`	`{`
`80`	`78`	`__nanosleep(sleep);`
`81`	`79`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`82`		`- else timeout = check_timeout(ctx, deadline, "all_reduce");`
`83`		`- if (timeout) break;`
	`80`	`+ else *abort_flag = check_timeout(ctx, deadline, "all_reduce");`
	`81`	`+ if (*abort_flag) break;`
`84`	`82`	`}`
`85`	`83`	`}`
`86`	`84`	`__syncthreads();`
`@@ -168,8 +166,8 @@ void pg_all_reduce_kernel`
`168`	`166`	`{`
`169`	`167`	`__nanosleep(sleep);`
`170`	`168`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`171`		`- else timeout = check_timeout(ctx, deadline, "all_reduce (1)");`
`172`		`- if (timeout) break;`
	`169`	`+ else *abort_flag = check_timeout(ctx, deadline, "all_reduce (1)");`
	`170`	`+ if (*abort_flag) break;`
`173`	`171`	`}`
`174`	`172`	`}`
`175`	`173`	`}`
`@@ -204,21 +202,21 @@ void pg_all_reduce_kernel`
`204`	`202`	`{`
`205`	`203`	`__nanosleep(sleep);`
`206`	`204`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`207`		`- else timeout = check_timeout(ctx, deadline, "all_reduce (2)");`
`208`		`- if (timeout) break;`
	`205`	`+ else *abort_flag = check_timeout(ctx, deadline, "all_reduce (2)");`
	`206`	`+ if (*abort_flag) break;`
`209`	`207`	`}`
`210`	`208`	`}`
`211`	`209`
`212`	`210`	`// Wait for destination to finish receiving`
`213`	`211`	`wait_min_stage(ctx->reduce_stage_consumed + dst_rank, stage_end, deadline);`
`214`	`212`	`}`
`215`	`213`
`216`		`- if (timeout) break;`
	`214`	`+ if (*abort_flag) break;`
`217`	`215`	`grid.sync();`
`218`	`216`	`}`
`219`	`217`
`220`	`218`	`// Finished. Reset counters for next kernel`
`221`		`- pg_barrier_inner(ctx, device_mask, this_device, master_device);`
	`219`	`+ pg_barrier_inner(ctx, device_mask, this_device, master_device, abort_flag);`
`222`	`220`
`223`	`221`	`if (t == 0)`
`224`	`222`	`{`
`@@ -237,7 +235,8 @@ void pg_all_reduce`
`237`	`235`	`int master_device,`
`238`	`236`	`at::Tensor& tensor,`
`239`	`237`	`uintptr_t shbuf,`
`240`		`- size_t shbuf_size`
	`238`	`+ size_t shbuf_size,`
	`239`	`+ at::Tensor& abort_flag`
`241`	`240`	`)`
`242`	`241`	`{`
`243`	`242`	`const at::cuda::OptionalCUDAGuard device_guard(this_device);`
`@@ -256,6 +255,7 @@ void pg_all_reduce`
`256`	`255`	`int threads = (int) CEIL_DIVIDE(CEIL_DIVIDE(data_size / 16ll, num_ranks), 32ll) * 32ll;`
`257`	`256`	`threads = MIN(threads, MAX_NUM_THREADS);`
`258`	`257`
	`258`	`+ uint32_t* abort_flag_ptr = (uint32_t*) abort_flag.data_ptr();`
`259`	`259`	`void* kernelArgs[] =`
`260`	`260`	`{`
`261`	`261`	`(void*)& ctx,`
`@@ -266,6 +266,7 @@ void pg_all_reduce`
`266`	`266`	`(void*)& shbuf_ptr,`
`267`	`267`	`(void*)& data_size,`
`268`	`268`	`(void*)& shbuf_size,`
	`269`	`+ (void*)& abort_flag_ptr`
`269`	`270`	`};`
`270`	`271`
`271`	`272`	`dim3 block_grid(2);`
Original file line number	Diff line number	Diff line change
`@@ -296,8 +296,6 @@ void perform_cpu_reduce`
`296`	`296`	`}`
`297`	`297`	`}`
`298`	`298`
`299`		`-__device__ bool pg_all_reduce_cpu_kernel_timeout;`
`300`		`-`
`301`	`299`	`#define PARCK_MODE_FLOAT 0`
`302`	`300`	`#define PARCK_MODE_HALF 1`
`303`	`301`	`#define PARCK_MODE_BF16 2`
`@@ -314,7 +312,8 @@ void pg_all_reduce_cpu_kernel`
`314`	`312`	`uint8_t* __restrict__ shbuf_ptr,`
`315`	`313`	`size_t data_size,`
`316`	`314`	`size_t shbuf_size,`
`317`		`- bool contributor`
	`315`	`+ bool contributor,`
	`316`	`+ uint32_t* abort_flag`
`318`	`317`	`)`
`319`	`318`	`{`
`320`	`319`	`// Indexing`
`@@ -335,8 +334,6 @@ void pg_all_reduce_cpu_kernel`
`335`	`334`
`336`	`335`	`int t = threadIdx.x;`
`337`	`336`	`int dir = blockIdx.x;`
`338`		`- if (t == 0)`
`339`		`- pg_all_reduce_cpu_kernel_timeout = false;`
`340`	`337`	`auto grid = cg::this_grid();`
`341`	`338`
`342`	`339`	`// Get device stage`
`@@ -453,14 +450,12 @@ void pg_all_reduce_cpu_kernel`
`453`	`450`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`454`	`451`	`else if (check_timeout(ctx, deadline, "pg_all_reduce_cpu_kernel"))`
`455`	`452`	`{`
`456`		`- DBGI2(ep, stage);`
`457`		`- to = true;`
	`453`	`+ *abort_flag = 1;`
`458`	`454`	`break;`
`459`	`455`	`}`
`460`	`456`	`}`
`461`	`457`	`}`
`462`	`458`	`__syncthreads();`
`463`		`- if (to) pg_all_reduce_cpu_kernel_timeout = true;`
`464`	`459`
`465`	`460`	`// Recv float`
`466`	`461`	`if constexpr (dtype == PARCK_MODE_FLOAT)`
`@@ -512,7 +507,7 @@ void pg_all_reduce_cpu_kernel`
`512`	`507`	`}`
`513`	`508`
`514`	`509`	`grid.sync();`
`515`		`- if (pg_all_reduce_cpu_kernel_timeout) break;`
	`510`	`+ if (*abort_flag) break;`
`516`	`511`	`}`
`517`	`512`	`}`
`518`	`513`
`@@ -526,7 +521,8 @@ void pg_all_reduce_cpu`
`526`	`521`	`bool contributor,`
`527`	`522`	`uintptr_t shbuf,`
`528`	`523`	`size_t shbuf_size,`
`529`		`- bool is_master`
	`524`	`+ bool is_master,`
	`525`	`+ at::Tensor& abort_flag`
`530`	`526`	`)`
`531`	`527`	`{`
`532`	`528`	`const at::cuda::OptionalCUDAGuard device_guard(this_device);`
`@@ -543,6 +539,7 @@ void pg_all_reduce_cpu`
`543`	`539`
`544`	`540`	`TORCH_CHECK(cpu_data_size % 16 == 0, "data_size must be multiple of 16");`
`545`	`541`
	`542`	`+ uint32_t* abort_flag_ptr = (uint32_t*) abort_flag.data_ptr();`
`546`	`543`	`void* kernelArgs[] =`
`547`	`544`	`{`
`548`	`545`	`(void*)& ctx,`
`@@ -553,7 +550,8 @@ void pg_all_reduce_cpu`
`553`	`550`	`(void*)& shbuf_ptr,`
`554`	`551`	`(void*)& device_data_size,`
`555`	`552`	`(void*)& shbuf_size,`
`556`		`- (void*)& contributor`
	`553`	`+ (void*)& contributor,`
	`554`	`+ (void*)& abort_flag_ptr`
`557`	`555`	`};`
`558`	`556`
`559`	`557`	`dim3 block_grid(2);`
Original file line number	Diff line number	Diff line change
`@@ -14,18 +14,19 @@ __global__ void pg_barrier_kernel`
`14`	`14`	`PGContext* __restrict__ ctx,`
`15`	`15`	`uint32_t device_mask,`
`16`	`16`	`int this_device,`
`17`		`- int coordinator_device`
	`17`	`+ int coordinator_device,`
	`18`	`+ uint32_t* abort_flag`
`18`	`19`	`)`
`19`	`20`	`{`
`20`		`- pg_barrier_inner(ctx, device_mask, this_device, coordinator_device);`
	`21`	`+ pg_barrier_inner(ctx, device_mask, this_device, coordinator_device, abort_flag);`
`21`	`22`	`}`
`22`	`23`
`23`		`-`
`24`	`24`	`void pg_barrier`
`25`	`25`	`(`
`26`	`26`	`uintptr_t ctx,`
`27`	`27`	`std::vector<uintptr_t> devices,`
`28`		`- int this_device`
	`28`	`+ int this_device,`
	`29`	`+ at::Tensor& abort_flag`
`29`	`30`	`)`
`30`	`31`	`{`
`31`	`32`	`const at::cuda::OptionalCUDAGuard device_guard(this_device);`
`@@ -40,7 +41,8 @@ void pg_barrier`
`40`	`41`	`(PGContext*) ctx, // Shared, pinned`
`41`	`42`	`device_mask,`
`42`	`43`	`this_device,`
`43`		`- devices[0]`
	`44`	`+ devices[0],`
	`45`	`+ (uint32_t*) abort_flag.data_ptr()`
`44`	`46`	`);`
`45`	`47`	`cuda_check(cudaPeekAtLastError());`
`46`	`48`	`}`
Original file line number	Diff line number	Diff line change
`@@ -7,5 +7,6 @@ void pg_barrier`
`7`	`7`	`(`
`8`	`8`	`uintptr_t ctx,`
`9`	`9`	`std::vector<uintptr_t> devices,`
`10`		`- int this_device`
	`10`	`+ int this_device,`
	`11`	`+ at::Tensor& abort_flag`
`11`	`12`	`);`
Original file line number	Diff line number	Diff line change
`@@ -4,11 +4,10 @@ __device__ __forceinline__ void pg_barrier_inner`
`4`	`4`	`PGContext* __restrict__ ctx,`
`5`	`5`	`uint32_t device_mask,`
`6`	`6`	`int this_device,`
`7`		`- int coordinator_device`
	`7`	`+ int coordinator_device,`
	`8`	`+ uint32_t* abort_flag`
`8`	`9`	`)`
`9`	`10`	`{`
`10`		`- bool timeout = false;`
`11`		`-`
`12`	`11`	`if (!blockIdx.x && !blockIdx.y && !blockIdx.z && !threadIdx.x && !threadIdx.y && !threadIdx.z)`
`13`	`12`	`{`
`14`	`13`	`uint32_t* epoch_ptr = &ctx->barrier_epoch;`
`@@ -47,8 +46,8 @@ __device__ __forceinline__ void pg_barrier_inner`
`47`	`46`	`{`
`48`	`47`	`__nanosleep(sleep);`
`49`	`48`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`50`		`- else timeout = check_timeout(ctx, deadline, "barrier");`
`51`		`- if (timeout) break;`
	`49`	`+ else *abort_flag = check_timeout(ctx, deadline, "barrier");`
	`50`	`+ if (*abort_flag) break;`
`52`	`51`	`}`
`53`	`52`	`else sleep = SYNC_MIN_SLEEP;`
`54`	`53`	`}`
`@@ -66,8 +65,8 @@ __device__ __forceinline__ void pg_barrier_inner`
`66`	`65`	`{`
`67`	`66`	`__nanosleep(sleep);`
`68`	`67`	`if (sleep < SYNC_MAX_SLEEP) sleep <<= 1;`
`69`		`- else timeout = check_timeout(ctx, deadline, "barrier");`
`70`		`- if (timeout) break;`
	`68`	`+ else *abort_flag = check_timeout(ctx, deadline, "barrier");`
	`69`	`+ if (*abort_flag) break;`
`71`	`70`	`}`
`72`	`71`	`}`
`73`	`72`	`}`