@@ -12,51 +12,46 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool src0_is_quantized = (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16);
     const bool src1_is_quantized = (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16);
 
-    // if (src0_is_quantized || src1_is_quantized) {
-    //     printf("DEBUG: OUT_PROD with quantized tensors - src0_quantized=%d, src1_quantized=%d\n",
-    //            src0_is_quantized, src1_is_quantized);
-    //     fflush(stdout);
-    // }
-
-    // GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    // GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
+    cudaStream_t stream = ctx.stream();
+    ggml_cuda_pool & pool = ctx.pool();
+
     // temp buffers
     float * src0_f32 = nullptr;
     float * src1_f32 = nullptr;
     bool allocated_src0 = false;
     bool allocated_src1 = false;
-    cudaStream_t stream = ctx.stream();
+    ggml_cuda_pool_alloc<float> src0_alloc(pool);
+    ggml_cuda_pool_alloc<float> src1_alloc(pool);
 
     if (src0_is_quantized) {
-        const size_t src0_size = ggml_nelements(src0) * sizeof(float);
-        CUDA_CHECK(cudaMallocAsync(&src0_f32, src0_size, stream));
+        const size_t src0_size = ggml_nelements(src0);
+        src0_alloc.alloc(src0_size);
+        src0_f32 = src0_alloc.ptr;
         allocated_src0 = true;
 
         // Dequantize
         auto dequantize_fn = ggml_get_to_fp32_cuda(src0->type);
         if (dequantize_fn) {
             dequantize_fn(src0->data, src0_f32, ggml_nelements(src0), stream);
         } else {
-            CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
             GGML_ABORT("Unsupported quant type for src0");
         }
     } else {
         src0_f32 = (float *) src0->data;
     }
 
     if (src1_is_quantized) {
-        const size_t src1_size = ggml_nelements(src1) * sizeof(float);
-        CUDA_CHECK(cudaMallocAsync(&src1_f32, src1_size, stream));
+        const size_t src1_size = ggml_nelements(src1);
+        src1_alloc.alloc(src1_size);
+        src1_f32 = src1_alloc.ptr;
         allocated_src1 = true;
 
         auto dequantize_fn = ggml_get_to_fp32_cuda(src1->type);
         if (dequantize_fn) {
-            dequantize_fn(src1->data, src1_f32, ggml_nelements(src0), stream);
+            dequantize_fn(src1->data, src1_f32, ggml_nelements(src1), stream);
         } else {
-            CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
             GGML_ABORT("Unsupported quant type for src1");
         }
     } else {
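
Reviewer note on the hunk above: the raw cudaMallocAsync/cudaFreeAsync pairs are replaced with ggml_cuda_pool_alloc taken from ctx.pool(), so the temporary F32 buffers are returned to the pool automatically when they go out of scope. That is also why the manual cudaFreeAsync calls on the GGML_ABORT paths could be dropped. A minimal standalone sketch of the same RAII idea follows; scoped_cuda_buffer is a hypothetical, simplified stand-in for illustration, not ggml's actual pool class:

// scoped_cuda_buffer: hypothetical stand-in for ggml_cuda_pool_alloc.
// It frees its device buffer on scope exit, so every exit path is covered.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

template <typename T>
struct scoped_cuda_buffer {
    T * ptr = nullptr;
    T * alloc(size_t n) {
        // a real pool would recycle buffers instead of calling cudaMalloc each time
        if (cudaMalloc((void **) &ptr, n * sizeof(T)) != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed\n");
            abort();
        }
        return ptr;
    }
    ~scoped_cuda_buffer() {
        if (ptr) {
            cudaFree(ptr); // no per-path free bookkeeping needed
        }
    }
};

int main() {
    scoped_cuda_buffer<float> buf;
    float * f32 = buf.alloc(1024); // analogous to src0_alloc.alloc(...) then src0_alloc.ptr
    (void) f32;
    return 0;                      // buffer released automatically here
}
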
@@ -74,9 +69,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ne2 == src1->ne[2]);
     GGML_ASSERT(ne3 == src1->ne[3]);
 
-    // const float * src0_d = (const float *) src0->data;
-    // const float * src1_d = (const float *) src1->data;
-
     // Use dequantized data
     const float * src0_d = src0_f32;
     const float * src1_d = src1_f32;
@@ -89,28 +81,21 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     CUBLAS_CHECK(cublasSetStream(handle, stream));
 
-    // const int64_t lda = nb01 / sizeof(float);
     const int64_t lda = allocated_src0 ? ne00 : (nb01 / sizeof(float));
     const int64_t ldc = nb1 / sizeof(float);
 
     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
-    // const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
     const int64_t ldb = allocated_src1 ?
         (src1_T ? ne10 : ne11) :
         ((src1_T ? nb10 : nb11) / sizeof(float));
 
-    // GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
     // Only assert for non dequantized src1
     if (!allocated_src1) {
         GGML_ASSERT((src1_T ? nb11 : nb10) == sizeof(float));
     }
 
     // data strides in dimensions 2/3
-    // const size_t s02 = nb02 / sizeof(float);
-    // const size_t s03 = nb03 / sizeof(float);
-    // const size_t s12 = nb12 / sizeof(float);
-    // const size_t s13 = nb13 / sizeof(float);
     const size_t s02 = allocated_src0 ? (ne00 * ne01) : nb02 / sizeof(float);
     const size_t s03 = allocated_src0 ? (ne00 * ne01 * ne02) : nb03 / sizeof(float);
     const size_t s12 = allocated_src1 ? (ne10 * ne11) : nb12 / sizeof(float);
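
The conditional stride logic in this hunk follows from the dequantized buffers being tightly packed: when allocated_srcX is true, element (i0, i1, i2, i3) of the temporary lives at offset i0 + i1*ne0 + i2*ne0*ne1 + i3*ne0*ne1*ne2, so the leading dimension and the batch strides derive from the logical dims rather than from the nb byte strides of the original tensor. A small sanity check of that arithmetic, with assumed example dims:

// Stride arithmetic for a packed (freshly dequantized) buffer.
// Dims are hypothetical; the relations mirror lda/s02/s03 above.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne00 = 32, ne01 = 8, ne02 = 4;
    const int64_t lda = ne00;               // adjacent i1 slices are ne00 floats apart
    const int64_t s02 = ne00 * ne01;        // one full matrix per step in dim 2
    const int64_t s03 = ne00 * ne01 * ne02; // one stack of matrices per step in dim 3
    // element (i0=0, i1=1, i2=1, i3=1) of the packed buffer:
    assert(0 + 1 * lda + 1 * s02 + 1 * s03 == 1312); // 32 + 256 + 1024
    return 0;
}
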
@@ -134,15 +119,4 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     }
 
-    if (allocated_src0) {
-        CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
-        // printf("DEBUG: Freed dequantized src0 buffer\n");
-    }
-    if (allocated_src1) {
-        CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
-        // // printf("DEBUG: Freed dequantized src1 buffer\n");
-    }
-
-    // printf("DEBUG: CUDA OUT_PROD completed successfully\n");
-    fflush(stdout);
 }
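
A closing design note: everything in this function is stream-ordered. The dequantize kernels and the GEMM are enqueued on the one ctx.stream(), and cublasSetStream (third hunk) puts cuBLAS on that same stream, so the GEMM sees the dequantized data without explicit synchronization; that is also why the removed fflush/printf cleanup added nothing. A minimal standalone illustration of this ordering, with hypothetical sizes and plain cuBLAS in place of the ggml plumbing:

// Stream ordering: a producer kernel and a cuBLAS GEMM on one stream.
// The fill kernel stands in for dequantization; sizes are illustrative.
#include <cublas_v2.h>
#include <cuda_runtime.h>

__global__ void fill(float * dst, int n, float v) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = v;
}

int main() {
    const int m = 4, n = 4, k = 4;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    float *a, *b, *c;
    cudaMalloc((void **) &a, m * k * sizeof(float));
    cudaMalloc((void **) &b, n * k * sizeof(float));
    cudaMalloc((void **) &c, m * n * sizeof(float));

    // stand-in for dequantization: produce A and B on the stream
    fill<<<1, m * k, 0, stream>>>(a, m * k, 1.0f);
    fill<<<1, n * k, 0, stream>>>(b, n * k, 2.0f);

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetStream(handle, stream); // GEMM is ordered after the fills

    const float alpha = 1.0f, beta = 0.0f;
    // C = A * B^T, mirroring the (OP_N, OP_T) combination of the
    // non-transposed-src1 path in the hunks above
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k,
                &alpha, a, m, b, n, &beta, c, m);

    cudaStreamSynchronize(stream); // only the host needs an explicit sync
    cublasDestroy(handle);
    cudaFree(a); cudaFree(b); cudaFree(c);
    cudaStreamDestroy(stream);
    return 0;
}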