@@ -1033,92 +1033,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     return buffer;
 }
 
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-#ifndef GGML_OPENCL_SMALL_ALLOC
-    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
-#else
-    // Small allocation allocates a separate buffer for each tensor. Instead of
-    // collecting multiple tensors to allocate a large buffer, each tensor is
-    // allocated a buffer immediately. This is only supposed to be used for
-    // weight tensors (note that weights can be f32).
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    ggml_backend_buffer_t * buffers = NULL;
-    size_t n_buffers = 0;
-
-    struct ggml_tensor * first_view = NULL;
-    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
-    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size_t this_size = 0;
-        if (t->data == NULL && t->view_src == NULL) {
-            // Tensor size must be properly padded.
-            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-
-        // The allocation logic here has grown beyond its original intention in
-        // order to make `test-backend-ops` work. The initial intention was to
-        // allocate memory for weights - each weight tensor gets its own buffer
-        // object. The original function should be used to allocate for
-        // intermediate tensors. There are usually no view tensors for weights;
-        // this is not true for intermediate tensors. However, `test-backend-ops`
-        // makes no differentiation between weight tensors and intermediate
-        // tensors, so this function is used for general allocation when small
-        // allocation is enabled in the test. This requires the function to also
-        // handle view tensors, which do not require actual allocation. In the
-        // original function, view tensors are allocated together with other
-        // non-view tensors since their sizes are 0.
-        // Here, we try to identify view tensors and allocate them with the next
-        // non-view tensor. View tensors cannot be allocated (alone) but must be
-        // initialized (together with non-view tensors).
-
-        // This is a view tensor if its size is 0. Record its location if it is
-        // the first one after a non-view tensor. If the next tensor is also a
-        // view, simply go to the next. We want to allocate all consecutive view
-        // tensors together with the next non-view tensor.
-        if (this_size == 0 && first_view == NULL) {
-            first_view = t;
-            continue;
-        }
-
-        if (first_view) {
-            // This is a non-view tensor. If there are any view tensors before
-            // this non-view tensor, we want to allocate these view tensors and
-            // this non-view tensor together.
-            // The first tensor to allocate is the first view tensor.
-            first = first_view;
-        } else {
-            // Otherwise, allocate this non-view tensor immediately.
-            first = t;
-        }
-
-        if (!alloc_tensor_range(ctx, first, ggml_get_next_tensor(ctx, t), buft, this_size, &buffers, &n_buffers)) {
-            return NULL;
-        }
-
-        // Always reset first_view after a non-view tensor.
-        first_view = NULL;
-    }
-
-    if (n_buffers == 0) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer;
-    if (n_buffers == 1) {
-        buffer = buffers[0];
-    } else {
-        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
-    }
-    free(buffers);
-    return buffer;
-#endif
-}
-
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
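
For context, the core of the deleted function is its view-grouping pass: pad each non-view tensor's size to the buffer alignment, defer any run of zero-size view tensors, and allocate that run together with the next non-view tensor so every weight ends up in its own buffer. Below is a minimal, self-contained C sketch of that pass under stated assumptions: the toy_tensor struct, PAD macro, and alloc_small helper are hypothetical stand-ins for the real ggml context/buffer API, not part of ggml.

/*
 * Sketch of per-tensor ("small") allocation with view grouping.
 * Hypothetical types; only the grouping logic mirrors the deleted code.
 */
#include <stddef.h>
#include <stdio.h>

/* Round x up to a multiple of a (a must be a power of two). */
#define PAD(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct toy_tensor {
    const char        *name;
    size_t             size;   /* 0 means this is a view tensor */
    struct toy_tensor *next;
};

/* Walk the list, emitting one "buffer" per non-view tensor; a pending
 * run of view tensors is folded into the allocation that follows it. */
static void alloc_small(struct toy_tensor *first, size_t alignment) {
    struct toy_tensor *first_view = NULL;
    for (struct toy_tensor *t = first; t != NULL; t = t->next) {
        size_t this_size = PAD(t->size, alignment);
        if (this_size == 0) {
            if (first_view == NULL) {
                first_view = t;  /* remember where the view run starts */
            }
            continue;
        }
        /* Allocate from the first deferred view (if any) through t. */
        struct toy_tensor *start = first_view ? first_view : t;
        printf("buffer of %zu bytes for [%s .. %s]\n",
               this_size, start->name, t->name);
        first_view = NULL;  /* always reset after a non-view tensor */
    }
}

int main(void) {
    struct toy_tensor c = { "w2",   4096, NULL };
    struct toy_tensor b = { "view",    0, &c   };
    struct toy_tensor a = { "w1",   1000, &b   };
    alloc_small(&a, 32);
    return 0;
}

Running the sketch prints one 1024-byte buffer for w1 alone and one 4096-byte buffer covering the view together with w2, mirroring how the deleted loop carried first_view forward and reset it after each non-view tensor.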