@@ -12,15 +12,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool src0_is_quantized = (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16);
     const bool src1_is_quantized = (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16);

-    // if (src0_is_quantized || src1_is_quantized) {
-    //     printf("DEBUG: OUT_PROD with quantized tensors - src0_quantized=%d, src1_quantized=%d\n",
-    //            src0_is_quantized, src1_is_quantized);
-    //     fflush(stdout);
-    // }
-
-    // GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    // GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

     // temp buffers
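Note on the "temp buffers" this hunk leads into: the `src0_f32`/`src1_f32` pointers and the `allocated_src0`/`allocated_src1` flags used further down are the dequantized F32 copies of the inputs. A minimal sketch of how such a buffer might be produced, assuming the `to_fp32_cuda_t` converters from ggml-cuda/convert.cuh (helper names and signatures vary across ggml versions; this is not the commit's verbatim code):

    // Hedged sketch: dequantize src0 into a packed temporary F32 buffer.
    float * src0_f32    = (float *) src0->data;  // already F32: use in place
    bool allocated_src0 = false;
    if (src0->type != GGML_TYPE_F32) {
        const int64_t n = ggml_nelements(src0);
        CUDA_CHECK(cudaMallocAsync((void **) &src0_f32, n * sizeof(float), stream));
        const to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);  // assumed helper
        to_fp32(src0->data, src0_f32, n, stream);  // stream-ordered conversion
        allocated_src0 = true;
    }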
@@ -74,9 +65,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ne2 == src1->ne[2]);
     GGML_ASSERT(ne3 == src1->ne[3]);

-    // const float * src0_d = (const float *) src0->data;
-    // const float * src1_d = (const float *) src1->data;
-
     // Use dequantized data
     const float * src0_d = src0_f32;
     const float * src1_d = src1_f32;
@@ -89,28 +77,21 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     CUBLAS_CHECK(cublasSetStream(handle, stream));

-    // const int64_t lda = nb01 / sizeof(float);
     const int64_t lda = allocated_src0 ? ne00 : (nb01 / sizeof(float));
     const int64_t ldc = nb1 / sizeof(float);

     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
-    // const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
     const int64_t ldb = allocated_src1 ?
         (src1_T ? ne10 : ne11) :
         ((src1_T ? nb10 : nb11) / sizeof(float));

-    // GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
     // Only assert for non dequantized src1
     if (!allocated_src1) {
         GGML_ASSERT((src1_T ? nb11 : nb10) == sizeof(float));
     }

     // data strides in dimensions 2/3
-    // const size_t s02 = nb02 / sizeof(float);
-    // const size_t s03 = nb03 / sizeof(float);
-    // const size_t s12 = nb12 / sizeof(float);
-    // const size_t s13 = nb13 / sizeof(float);
     const size_t s02 = allocated_src0 ? (ne00 * ne01)        : nb02 / sizeof(float);
     const size_t s03 = allocated_src0 ? (ne00 * ne01 * ne02) : nb03 / sizeof(float);
     const size_t s12 = allocated_src1 ? (ne10 * ne11)        : nb12 / sizeof(float);
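The distinction this hunk encodes: a freshly dequantized buffer is packed, so its leading dimension is `ne00` and its 2/3-dimension strides are `ne00*ne01` and `ne00*ne01*ne02` elements, whereas an original F32 tensor may be a strided view, where the byte strides `nb*` are authoritative. For orientation, the unchanged consumer of these values sits between this hunk and the next; in upstream ggml it is a per-slice SGEMM loop roughly like the sketch below (simplified: `dst_d`, `s2`/`s3`, `alpha`/`beta`, and the src0 broadcast handling come from the surrounding file, not from this diff):

    for (int64_t i3 = 0; i3 < ne3; ++i3) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            // dst slice (i2, i3) = src0 slice * src1 slice^T, one GEMM per 2/3-slice
            CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
                    ne0, ne1, ne01,
                    &alpha, src0_d + i3*s03 + i2*s02, lda,
                            src1_d + i3*s13 + i2*s12, ldb,
                    &beta,  dst_d  + i3*s3  + i2*s2,  ldc));
        }
    }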
@@ -136,13 +117,8 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     if (allocated_src0) {
         CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
-        // printf("DEBUG: Freed dequantized src0 buffer\n");
     }
     if (allocated_src1) {
         CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
-        // // printf("DEBUG: Freed dequantized src1 buffer\n");
     }
-
-    // printf("DEBUG: CUDA OUT_PROD completed successfully\n");
-    fflush(stdout);
 }
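A note on the cleanup hunk: `cudaFreeAsync` enqueues the free on the same `stream` as the dequantize kernels and the GEMMs, so the temporary buffers are released only after that prior work completes; no extra synchronization is needed. A minimal sketch of the stream-ordered allocation pattern assumed here (CUDA 11.2+; `n_bytes` is a placeholder):

    float * tmp = nullptr;
    CUDA_CHECK(cudaMallocAsync((void **) &tmp, n_bytes, stream));  // alloc ordered on stream
    // ... kernels that read/write tmp, enqueued on the same stream ...
    CUDA_CHECK(cudaFreeAsync(tmp, stream));  // freed only after prior work on stream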