@@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
867867    return  acl_tensor;
868868}
869869
870+ /* *
871+  * @brief Fills a tensor with a scalar value. 
872+  * 
873+  * This function fills the destination tensor `acl_dst` with the scalar value 
874+  * `scalar`. 
875+  * 
876+  * @param ctx The context for the CANN backend operations. 
877+  * @param scalar The scalar value used to fill the tensor. 
878+  * @param acl_dst The destination tensor to be filled with the scalar value. 
879+  */  
880+ static  void  aclnn_fill_scalar (ggml_backend_cann_context& ctx, float  scalar,
881+                               aclTensor* acl_dst) {
882+     auto  acl_scalar = aclCreateScalar (&scalar, aclDataType::ACL_FLOAT);
883+     GGML_CANN_CALL_ACLNN_OP (ctx, InplaceFillScalar, acl_dst, acl_scalar);
884+     ggml_cann_release_resources (ctx, acl_scalar);
885+ }
886+ 
887+ /* *
888+  * @brief Get or expand a cached float32 tensor filled with a scalar value. 
889+  * 
890+  * This function manages cached device memory for float32 tensors. If the current 
891+  * cache size is insufficient for the requested tensor shape, the old memory will 
892+  * be released and new memory will be allocated. The allocated buffer is then 
893+  * initialized either with zeros (when @p value == 0.0f) or with the given scalar 
894+  * value using CANN operations. Finally, an aclTensor object is created from the 
895+  * cached memory and returned. 
896+  * 
897+  * @param ctx           The CANN backend context that manages device memory. 
898+  * @param buffer        A pointer to the cached device buffer (will be allocated 
899+  *                      or reallocated if necessary). 
900+  * @param cache_element The current number of cached elements. This will be 
901+  *                      updated when the cache is expanded. 
902+  * @param ne            The tensor shape array (number of elements in each dimension). 
903+  * @param nb            The stride size for each dimension. 
904+  * @param dims          The number of tensor dimensions. 
905+  * @param value         The scalar value used to fill the tensor (supports zero 
906+  *                      initialization via memset or arbitrary values via fill_scalar). 
907+  * @return              An aclTensor pointer created from the cached buffer. 
908+  */  
909+ static  aclTensor* get_f32_cache_acl_tensor (
910+     ggml_backend_cann_context& ctx,
911+     void ** buffer,
912+     int64_t  &cache_element,
913+     int64_t * ne,
914+     size_t * nb,
915+     int64_t  dims,
916+     float  value) {
917+     //  Calculate total number of elements
918+     int64_t  n_element = 1 ;
919+     for  (int  i = 0 ; i < dims; i++) {
920+         n_element *= ne[i];
921+     }
922+     size_t  size = n_element * sizeof (float );
923+ 
924+     //  Allocate or expand cache if needed
925+     if  (cache_element < n_element) {
926+         if  (*buffer != nullptr ) {
927+             aclrtFree (*buffer);
928+             *buffer = nullptr ;
929+         }
930+ 
931+         ACL_CHECK (aclrtMalloc (buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
932+         cache_element = n_element;
933+ 
934+         //  Initialize cache
935+         if  (value == 0 .0f ) {
936+             ACL_CHECK (aclrtMemsetAsync (*buffer, size, 0 , size, ctx.stream ()));
937+         } else  {
938+             int64_t  pool_ne[1 ] = { n_element };
939+             size_t  pool_nb[1 ] = { sizeof (float ) };
940+             aclTensor* acl_value = ggml_cann_create_tensor (
941+                 *buffer, ACL_FLOAT, sizeof (float ), pool_ne, pool_nb, 1 );
942+             aclnn_fill_scalar (ctx, 1 , acl_value);
943+             ggml_cann_release_resources (ctx, acl_value);
944+         }
945+     }
946+ 
947+     return  ggml_cann_create_tensor (*buffer, ACL_FLOAT, sizeof (float ), ne, nb, dims);
948+ }
949+ 
870950void  ggml_cann_rms_norm (ggml_backend_cann_context& ctx, ggml_tensor* dst) {
871951    ggml_tensor* src = dst->src [0 ];
872952
@@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
875955
876956    float  eps;
877957    memcpy (&eps, dst->op_params , sizeof (float ));
878-     size_t  one_tensor_n_bytes = src->ne [0 ] * ggml_element_size (src);
879-     ggml_cann_pool_alloc one_tensor_allocator (ctx.pool (), one_tensor_n_bytes);
880- 
881-     aclTensor* acl_gamma = aclnn_values (
882-         ctx, one_tensor_allocator.get (), one_tensor_n_bytes, src->ne , 1 ,
883-         ggml_cann_type_mapping (src->type ), ggml_element_size (src));
884- 
885-     size_t  zero_tensor_n_bytes =
886-         src->ne [1 ] * src->ne [2 ] * src->ne [3 ] * ggml_element_size (src);
887-     ggml_cann_pool_alloc zero_tensor_allocator (ctx.pool (), zero_tensor_n_bytes);
888-     aclTensor* acl_rstd =
889-         aclnn_zero (ctx, zero_tensor_allocator.get (), zero_tensor_n_bytes,
890-                    src->ne , GGML_MAX_DIMS, ggml_cann_type_mapping (src->type ),
891-                    ggml_element_size (src));
958+ 
959+     //  build gamma, one...
960+     size_t  acl_gamma_nb[GGML_MAX_DIMS];
961+     acl_gamma_nb[0 ] = sizeof (float );
962+     for  (int  i = 1 ; i < GGML_MAX_DIMS; i++) {
963+         acl_gamma_nb[i] = acl_gamma_nb[i - 1 ] * src->ne [i - 1 ];
964+     }
965+     aclTensor* acl_gamma = get_f32_cache_acl_tensor (
966+         ctx,
967+         &ctx.f32_one_cache ,
968+         ctx.f32_one_cache_element ,
969+         src->ne ,
970+         acl_gamma_nb,
971+         1 ,        //  dims
972+         1 .0f       //  value
973+     );
974+ 
975+     //  build rstd, zero...
976+     size_t  acl_rstd_nb[GGML_MAX_DIMS];
977+     acl_rstd_nb[0 ] = sizeof (float );
978+     for  (int  i = 1 ; i < GGML_MAX_DIMS; i++) {
979+         acl_rstd_nb[i] = acl_rstd_nb[i - 1 ] * src->ne [i - 1 ];
980+     }
981+     aclTensor* acl_rstd = get_f32_cache_acl_tensor (
982+         ctx,
983+         &ctx.f32_zero_cache ,
984+         ctx.f32_zero_cache_element ,
985+         src->ne ,
986+         acl_rstd_nb,
987+         GGML_MAX_DIMS,
988+         0 .0f       //  value
989+     );
990+ 
892991    GGML_CANN_CALL_ACLNN_OP (ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
893992    ggml_cann_release_resources (ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
894993}
@@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
9031002
9041003    const  int  n_past = ((int32_t *)dst->op_params )[0 ];
9051004
906-     size_t  one_tensor_n_bytes = src->ne [0 ] * src->ne [1 ] * src->ne [2 ] *
907-                                 src->ne [3 ] * ggml_element_size (src);
908-     ggml_cann_pool_alloc one_tensor_allocator (ctx.pool (), one_tensor_n_bytes);
1005+     ggml_cann_pool_alloc one_tensor_allocator (ctx.pool (), ggml_nbytes (src));
1006+     void * buffer = one_tensor_allocator.get ();
9091007
910-     aclTensor* mask_tensor =
911-         aclnn_values (ctx, one_tensor_allocator. get (), one_tensor_n_bytes, 
912-                      src-> ne , GGML_MAX_DIMS,  ggml_cann_type_mapping (src-> type ), 
913-                       ggml_element_size (src) , value);
1008+     aclTensor* mask_tensor =  ggml_cann_create_tensor (buffer,  ggml_cann_type_mapping (src-> type ), 
1009+         ggml_type_size (src-> type ), src-> ne , src-> nb , GGML_MAX_DIMS); 
1010+ 
1011+     aclnn_fill_scalar (ctx , value, mask_tensor );
9141012
9151013    aclScalar* alpha = nullptr ;
9161014    float  alphaValue = 1 .0f ;
@@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
12771375        tmp_permute_tensor, tmp_mul_tensor, acl_dst);
12781376}
12791377
1280- /* *
1281-  * @brief Fills a tensor with a scalar value. 
1282-  * 
1283-  * This function fills the destination tensor `acl_dst` with the scalar value 
1284-  * `scalar`. 
1285-  * 
1286-  * @param ctx The context for the CANN backend operations. 
1287-  * @param scalar The scalar value used to fill the tensor. 
1288-  * @param acl_dst The destination tensor to be filled with the scalar value. 
1289-  */  
1290- static  void  aclnn_fill_scalar (ggml_backend_cann_context& ctx, float  scalar,
1291-                               aclTensor* acl_dst) {
1292-     auto  acl_scalar = aclCreateScalar (&scalar, aclDataType::ACL_FLOAT);
1293-     GGML_CANN_CALL_ACLNN_OP (ctx, InplaceFillScalar, acl_dst, acl_scalar);
1294-     ggml_cann_release_resources (ctx, acl_scalar);
1295- }
1296- 
12971378/* *
12981379 * @brief Raises each element of a tensor to the power of the corresponding 
12991380 * element in another tensor. 
0 commit comments