Commit a71fc37

Fix out_prod CI issues
1 parent b0c5b5b commit a71fc37

5 files changed: +15 -45 lines changed

examples/training/finetune.cpp

Lines changed: 1 addition & 1 deletion
@@ -93,4 +93,4 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 
     return 0;
-}
+}

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 1 deletion
@@ -3202,7 +3202,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             }
         } break;
         case GGML_OP_OUT_PROD:
-            // return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
             return op->type == GGML_TYPE_F32;
         case GGML_OP_GET_ROWS:
         {
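The deleted comment was the stricter gate that also required F32 sources; the live check now only requires an F32 result, since quantized sources are dequantized inside the CUDA out_prod path (see out-prod.cu below). A minimal sketch of how a caller could observe this through the public ggml-backend API; the shapes and the Q8_0 type are illustrative assumptions, not part of the commit:

// Sketch (illustrative, not from this commit): ask the GPU device whether it
// accepts an OUT_PROD node whose src0 is quantized.
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all(); // make sure backend devices are registered

    ggml_init_params params = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ true };
    ggml_context * ctx = ggml_init(params);

    // out_prod(a, b): a may now be quantized, the result stays F32
    ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 64, 32);
    ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  16, 32);
    ggml_tensor * op = ggml_out_prod(ctx, a, b);

    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
    if (dev != nullptr) {
        printf("OUT_PROD supported: %s\n", ggml_backend_dev_supports_op(dev, op) ? "yes" : "no");
    }

    ggml_free(ctx);
    return 0;
}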

ggml/src/ggml-cuda/out-prod.cu

Lines changed: 12 additions & 38 deletions
@@ -12,51 +12,46 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const bool src0_is_quantized = (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16);
     const bool src1_is_quantized = (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16);
 
-    // if (src0_is_quantized || src1_is_quantized) {
-    //     printf("DEBUG: OUT_PROD with quantized tensors - src0_quantized=%d, src1_quantized=%d\n",
-    //            src0_is_quantized, src1_is_quantized);
-    //     fflush(stdout);
-    // }
-
-    // GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    // GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
+    cudaStream_t stream = ctx.stream();
+    ggml_cuda_pool & pool = ctx.pool();
+
     // temp buffers
     float * src0_f32 = nullptr;
     float * src1_f32 = nullptr;
     bool allocated_src0 = false;
     bool allocated_src1 = false;
-    cudaStream_t stream = ctx.stream();
+    ggml_cuda_pool_alloc<float> src0_alloc(pool);
+    ggml_cuda_pool_alloc<float> src1_alloc(pool);
 
     if (src0_is_quantized) {
-        const size_t src0_size = ggml_nelements(src0) * sizeof(float);
-        CUDA_CHECK(cudaMallocAsync(&src0_f32, src0_size, stream));
+        const size_t src0_size = ggml_nelements(src0);
+        src0_alloc.alloc(src0_size);
+        src0_f32 = src0_alloc.ptr;
         allocated_src0 = true;
 
         // Dequantize
         auto dequantize_fn = ggml_get_to_fp32_cuda(src0->type);
         if (dequantize_fn) {
             dequantize_fn(src0->data, src0_f32, ggml_nelements(src0), stream);
         } else {
-            CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
             GGML_ABORT("Unsupported quant type for src0");
         }
     } else {
         src0_f32 = (float *) src0->data;
     }
 
     if (src1_is_quantized) {
-        const size_t src1_size = ggml_nelements(src1) * sizeof(float);
-        CUDA_CHECK(cudaMallocAsync(&src1_f32, src1_size, stream));
+        const size_t src1_size = ggml_nelements(src1);
+        src1_alloc.alloc(src1_size);
+        src1_f32 = src1_alloc.ptr;
         allocated_src1 = true;
 
         auto dequantize_fn = ggml_get_to_fp32_cuda(src1->type);
         if (dequantize_fn) {
-            dequantize_fn(src1->data, src1_f32, ggml_nelements(src0), stream);
+            dequantize_fn(src1->data, src1_f32, ggml_nelements(src1), stream);
         } else {
-            CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
             GGML_ABORT("Unsupported quant type for src1");
         }
     } else {
@@ -74,9 +69,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ne2 == src1->ne[2]);
     GGML_ASSERT(ne3 == src1->ne[3]);
 
-    // const float * src0_d = (const float *) src0->data;
-    // const float * src1_d = (const float *) src1->data;
-
     // Use dequantized data
     const float * src0_d = src0_f32;
     const float * src1_d = src1_f32;
@@ -89,28 +81,21 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     CUBLAS_CHECK(cublasSetStream(handle, stream));
 
-    // const int64_t lda = nb01 / sizeof(float);
     const int64_t lda = allocated_src0 ? ne00 : (nb01 / sizeof(float));
     const int64_t ldc = nb1 / sizeof(float);
 
     const bool src1_T = ggml_is_transposed(src1);
     const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
-    // const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
     const int64_t ldb = allocated_src1 ?
         (src1_T ? ne10 : ne11) :
         ((src1_T ? nb10 : nb11) / sizeof(float));
 
-    // GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
     // Only assert for non dequantized src1
     if (!allocated_src1) {
         GGML_ASSERT((src1_T ? nb11 : nb10) == sizeof(float));
     }
 
     // data strides in dimensions 2/3
-    // const size_t s02 = nb02 / sizeof(float);
-    // const size_t s03 = nb03 / sizeof(float);
-    // const size_t s12 = nb12 / sizeof(float);
-    // const size_t s13 = nb13 / sizeof(float);
     const size_t s02 = allocated_src0 ? (ne00 * ne01) : nb02 / sizeof(float);
     const size_t s03 = allocated_src0 ? (ne00 * ne01 * ne02): nb03 / sizeof(float);
     const size_t s12 = allocated_src1 ? (ne10 * ne11) : nb12 / sizeof(float);
@@ -134,15 +119,4 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     }
 
-    if (allocated_src0) {
-        CUDA_CHECK(cudaFreeAsync(src0_f32, stream));
-        // printf("DEBUG: Freed dequantized src0 buffer\n");
-    }
-    if (allocated_src1) {
-        CUDA_CHECK(cudaFreeAsync(src1_f32, stream));
-        // // printf("DEBUG: Freed dequantized src1 buffer\n");
-    }
-
-    // printf("DEBUG: CUDA OUT_PROD completed successfully\n");
-    fflush(stdout);
 }
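The substantive changes in this file: the temporary F32 buffers now come from the backend's memory pool via ggml_cuda_pool_alloc<float> instead of cudaMallocAsync, so they are released automatically when the allocators go out of scope (which is why the explicit cudaFreeAsync calls, including the ones on the GGML_ABORT branches, are gone), and the src1 dequantization is now sized with ggml_nelements(src1) rather than the copy-pasted ggml_nelements(src0). A minimal sketch of the resulting pattern, assuming the ggml_cuda_pool_alloc and ggml_get_to_fp32_cuda helpers from ggml-cuda's common.cuh and convert.cuh; the maybe_dequantize_f32 helper itself is hypothetical, not part of the commit:

// Sketch (illustrative, not from this commit) of the pool-backed dequantize step.
// The helper name is hypothetical; the calls mirror the code above.
#include "common.cuh"    // ggml_backend_cuda_context, ggml_cuda_pool_alloc, GGML_ASSERT
#include "convert.cuh"   // ggml_get_to_fp32_cuda

static const float * maybe_dequantize_f32(ggml_backend_cuda_context & ctx,
                                          const ggml_tensor * src,
                                          ggml_cuda_pool_alloc<float> & buf) {
    if (src->type == GGML_TYPE_F32) {
        return (const float *) src->data;            // already F32, use in place
    }
    buf.alloc(ggml_nelements(src));                  // element count, not bytes
    auto dequantize_fn = ggml_get_to_fp32_cuda(src->type);
    GGML_ASSERT(dequantize_fn != nullptr);
    dequantize_fn(src->data, buf.ptr, ggml_nelements(src), ctx.stream());
    return buf.ptr;                                  // returned to the pool when buf is destroyed
}

ggml_cuda_pool_alloc is templated on the element type and its alloc() takes an element count, which is consistent with the * sizeof(float) factor disappearing from the diff above.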

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 1 addition & 4 deletions
@@ -9364,9 +9364,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         break;
 
     case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-        // std::cerr << "*** GGML_VK_BUILD_GRAPH: CROSS_ENTROPY_LOSS_BACK case hit, calling ggml_vk_cross_entropy_loss_back" << std::endl;
-        // std::cout << "*** GGML_VK_BUILD_GRAPH: CROSS_ENTROPY_LOSS_BACK case hit, calling ggml_vk_cross_entropy_loss_back" << std::endl;
-        // fflush(stdout); fflush(stderr);
         ggml_vk_cross_entropy_loss_back(ctx, compute_ctx, src0, src1, src2, node, dryrun);
 
         break;
@@ -10692,7 +10689,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
     case GGML_OP_SUB:
     case GGML_OP_MUL:
     case GGML_OP_DIV:
-        return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+        return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
            (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
            (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
     case GGML_OP_SILU_BACK:

src/llama-lora-training.cpp

Lines changed: 1 addition & 1 deletion
@@ -290,7 +290,7 @@ struct llama_adapter_lora * llama_lora_training_init(
 bool llama_opt_param_filter_lora(const struct ggml_tensor * tensor, void * userdata) {
     (void) userdata; // Unused param
 
-    if (!tensor || !tensor->name) {
+    if (!tensor) {
         return false;
     }
 
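In ggml, tensor->name is a fixed-size character array embedded in struct ggml_tensor, so !tensor->name tests the address of an array member and is always false; compilers flag this (e.g. GCC's -Waddress), which breaks builds that treat warnings as errors. If an empty name ever needs to be rejected, test the contents instead; the helper below is a hypothetical sketch, not part of the commit:

// Hypothetical helper (not in the commit): check the array's contents, not its address.
#include "ggml.h"

static bool tensor_has_name(const struct ggml_tensor * tensor) {
    return tensor != nullptr && tensor->name[0] != '\0';
}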
