@@ -12,15 +12,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool src0_is_quantized = (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16);
     const bool src1_is_quantized = (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16);
 
-    // if (src0_is_quantized || src1_is_quantized) {
-    //     printf("DEBUG: OUT_PROD with quantized tensors - src0_quantized=%d, src1_quantized=%d\n",
-    //            src0_is_quantized, src1_is_quantized);
-    //     fflush(stdout);
-    // }
-
-    // GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    // GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     // temp buffers
@@ -74,9 +65,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ne2 == src1->ne[2]);
     GGML_ASSERT(ne3 == src1->ne[3]);
 
-    // const float * src0_d = (const float *) src0->data;
-    // const float * src1_d = (const float *) src1->data;
-
     // Use dequantized data
     const float * src0_d = src0_f32;
     const float * src1_d = src1_f32;
@@ -89,28 +77,21 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     CUBLAS_CHECK(cublasSetStream(handle, stream));
 
-    // const int64_t lda = nb01 / sizeof(float);
     const int64_t lda = allocated_src0 ? ne00 : (nb01 / sizeof(float));
     const int64_t ldc = nb1 / sizeof(float);
 
     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
-    // const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
     const int64_t ldb = allocated_src1 ?
         (src1_T ? ne10 : ne11) :
         ((src1_T ? nb10 : nb11) / sizeof(float));
 
-    // GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
     // Only assert for non-dequantized src1
     if (!allocated_src1) {
         GGML_ASSERT((src1_T ? nb11 : nb10) == sizeof(float));
     }
 
     // data strides in dimensions 2/3
-    // const size_t s02 = nb02 / sizeof(float);
-    // const size_t s03 = nb03 / sizeof(float);
-    // const size_t s12 = nb12 / sizeof(float);
-    // const size_t s13 = nb13 / sizeof(float);
     const size_t s02 = allocated_src0 ? (ne00 * ne01)        : nb02 / sizeof(float);
     const size_t s03 = allocated_src0 ? (ne00 * ne01 * ne02) : nb03 / sizeof(float);
     const size_t s12 = allocated_src1 ? (ne10 * ne11)        : nb12 / sizeof(float);
@@ -136,13 +117,8 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     if (allocated_src0) {
         CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
-        // printf("DEBUG: Freed dequantized src0 buffer\n");
     }
     if (allocated_src1) {
         CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
-        // // printf("DEBUG: Freed dequantized src1 buffer\n");
     }
-
-    // printf("DEBUG: CUDA OUT_PROD completed successfully\n");
-    fflush(stdout);
 }
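
Note on the stride selection in the third hunk: it relies on the dequantized temp buffers being fully contiguous in float, so element strides can be derived from the logical dimensions ne* instead of the tensor's byte strides nb*. A minimal standalone sketch of that mapping, with illustrative names (Strides and pick_strides are not part of the patch):

#include <cstddef>
#include <cstdint>

struct Strides {
    int64_t ld; // leading dimension passed to cuBLAS
    size_t  s2; // element stride between matrices in dim 2
    size_t  s3; // element stride between matrices in dim 3
};

// A freshly dequantized, contiguous float buffer gets strides that are plain
// products of the dimensions; the original tensor uses its byte strides nb[i].
static Strides pick_strides(bool allocated, const int64_t ne[4], const size_t nb[4]) {
    Strides s;
    s.ld = allocated ? ne[0]                           : (int64_t)(nb[1] / sizeof(float));
    s.s2 = allocated ? (size_t)(ne[0] * ne[1])         : nb[2] / sizeof(float);
    s.s3 = allocated ? (size_t)(ne[0] * ne[1] * ne[2]) : nb[3] / sizeof(float);
    return s;
}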
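
The cudaFreeAsync calls in the final hunk are the tail end of a stream-ordered alloc/dequantize/free lifecycle for those temp buffers. A hedged sketch of the pattern, assuming a converter named dequantize_to_f32 (hypothetical; the real code uses ggml's CUDA dequantization helpers):

// Allocate and fill a contiguous F32 copy of a quantized src0 on the stream.
float * src0_f32    = nullptr;
bool allocated_src0 = false;
if (src0_is_quantized) {
    const int64_t nelem = ne00 * ne01 * ne02 * ne03;
    CUDA_CHECK(cudaMallocAsync((void **) &src0_f32, nelem * sizeof(float), stream));
    dequantize_to_f32(src0->data, src0_f32, nelem, stream); // hypothetical converter
    allocated_src0 = true;
}

// ... GEMMs are issued on the same stream ...

if (allocated_src0) {
    CUDA_CHECK(cudaFreeAsync(src0_f32, stream)); // ordered after the GEMMs complete
}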