Skip to content

Commit d13e8ba

Browse files
authored
Merge branch 'ggerganov:master' into embed_yolo_files
2 parents 8d6feac + 7bc1d8e commit d13e8ba

File tree

185 files changed

+12687
-23210
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

185 files changed

+12687
-23210
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ jobs:
6161

6262
- name: Configure CMake
6363
working-directory: ./build
64-
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
64+
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=OFF ..
6565

6666
- name: Build
6767
working-directory: ./build
@@ -112,7 +112,7 @@ jobs:
112112

113113
- name: Configure CMake
114114
working-directory: ./build
115-
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
115+
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=OFF ..
116116

117117
- name: Build
118118
working-directory: ./build

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
build/
2+
build-blas/
23
build-debug/
34
build-release/
45
build-sanitize-addr/
@@ -30,6 +31,7 @@ tests/arm_neon.h
3031
zig-out/
3132
zig-cache/
3233

34+
*.o
3335
*.dot
3436

3537
*.sw?

CMakeLists.txt

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,16 @@ endif()
2525

2626
# options
2727

28+
if (APPLE)
29+
set(GGML_METAL_DEFAULT ON)
30+
set(GGML_BLAS_DEFAULT ON)
31+
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
32+
else()
33+
set(GGML_METAL_DEFAULT OFF)
34+
set(GGML_BLAS_DEFAULT OFF)
35+
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
36+
endif()
37+
2838
option(BUILD_SHARED_LIBS "ggml: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
2939

3040
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
@@ -41,12 +51,13 @@ option(GGML_TEST_COVERAGE "ggml: enable test coverage" OFF)
4151

4252
option(GGML_PERF "ggml: enable perf timings" OFF)
4353
option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
44-
option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
45-
option(GGML_CLBLAST "ggml: use clBLAST" OFF)
54+
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
55+
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
56+
"ggml: BLAS library vendor")
4657
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
4758
option(GGML_CUDA "ggml: use CUDA" OFF)
4859
option(GGML_CUBLAS "ggml: use CUDA (deprecated)" OFF)
49-
option(GGML_METAL "ggml: use Metal" OFF)
60+
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
5061
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
5162
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
5263
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" OFF)

README.md

Lines changed: 1 addition & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Some of the development is currently happening in the [llama.cpp](https://github
2424

2525
- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
2626
- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
27-
- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
27+
- [X] Example of Whisper inference [ggerganov/whisper.cpp](https://github.com/ggerganov/whisper.cpp)
2828
- [X] Example of LLaMA inference [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
2929
- [X] Example of LLaMA training [ggerganov/llama.cpp/examples/baby-llama](https://github.com/ggerganov/llama.cpp/tree/master/examples/baby-llama)
3030
- [X] Example of Falcon inference [cmp-nct/ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp)
@@ -44,20 +44,6 @@ Some of the development is currently happening in the [llama.cpp](https://github
4444
- [X] Example of multiple LLMs inference [foldl/chatllm.cpp](https://github.com/foldl/chatllm.cpp)
4545
- [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml
4646

47-
## Whisper inference (example)
48-
49-
With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
50-
51-
Memory requirements:
52-
53-
| Model | Disk | Mem |
54-
| --- | --- | --- |
55-
| tiny | 75 MB | ~280 MB |
56-
| base | 142 MB | ~430 MB |
57-
| small | 466 MB | ~1.0 GB |
58-
| medium | 1.5 GB | ~2.6 GB |
59-
| large | 2.9 GB | ~4.7 GB |
60-
6147
## GPT inference (example)
6248

6349
With ggml you can efficiently run [GPT-2](examples/gpt-2) and [GPT-J](examples/gpt-j) inference on the CPU.
@@ -128,11 +114,6 @@ cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
128114
cmake -DCMAKE_C_COMPILER="$(hipconfig -l)/clang" -DCMAKE_CXX_COMPILER="$(hipconfig -l)/clang++" -DGGML_HIPBLAS=ON
129115
```
130116

131-
## Using clBLAST
132-
133-
```bash
134-
cmake -DGGML_CLBLAST=ON ..
135-
```
136117
## Compiling for Android
137118

138119
Download and unzip the NDK from this download [page](https://developer.android.com/ndk/downloads). Set the NDK_ROOT_PATH environment variable or provide the absolute path to the CMAKE_ANDROID_NDK in the command below.
@@ -170,64 +151,6 @@ export LD_LIBRARY_PATH=/data/local/tmp
170151
./bin/gpt-2-backend -m models/ggml-model.bin -p "this is an example"
171152
```
172153

173-
### CLBlast for Android
174-
175-
Build CLBlast.
176-
177-
```bash
178-
# In CLBlast/build
179-
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
180-
-DCMAKE_SYSTEM_NAME=Android \
181-
-DCMAKE_SYSTEM_VERSION=33 \
182-
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
183-
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
184-
-DCMAKE_ANDROID_STL_TYPE=c++_static \
185-
-DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
186-
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
187-
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
188-
189-
# Build libclblast.so
190-
make -j4
191-
```
192-
193-
Pull `libGLES_mali.so` to `libOpenCL.so`.
194-
195-
```bash
196-
# In ggml project root.
197-
mkdir arm64-v8a
198-
adb pull /system/vendor/lib64/egl/libGLES_mali.so arm64-v8a/libOpenCL.so
199-
```
200-
201-
Build ggml with CLBlast.
202-
203-
```bash
204-
# In ggml/build
205-
cd build
206-
$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
207-
-DGGML_CLBLAST=ON \
208-
-DCMAKE_SYSTEM_NAME=Android \
209-
-DCMAKE_SYSTEM_VERSION=33 \
210-
-DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
211-
-DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
212-
-DCMAKE_ANDROID_STL_TYPE=c++_shared \
213-
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
214-
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
215-
-DCLBLAST_HOME=$(readlink -f ../../CLBlast) \
216-
-DOPENCL_LIB=$(readlink -f ../arm64-v8a/libOpenCL.so)
217-
218-
# Run make, adb push, etc.
219-
```
220-
221-
Then in `adb shell`...
222-
223-
```bash
224-
cd /data/local/tmp
225-
export LD_LIBRARY_PATH=/system/vendor/lib64/egl:/data/local/tmp
226-
./bin/gpt-2-backend -m models/ggml-model.bin -n 64 -p "Pepperoni pizza"
227-
```
228-
229-
OpenCL does not have the same level of support in `ggml-backend` as CUDA or Metal. In the `gpt-2-backend` example, OpenCL will only be used for the matrix multiplications when evaluating large prompts.
230-
231154
## Resources
232155

233156
- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML

ci/run.sh

Lines changed: 6 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -218,39 +218,6 @@ function gg_sum_mnist {
218218
gg_printf '```\n'
219219
}
220220

221-
# whisper
222-
223-
function gg_run_whisper {
224-
cd ${SRC}
225-
226-
gg_wget models-mnt/whisper/ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin
227-
gg_wget models-mnt/whisper/ https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
228-
229-
cd build-ci-release
230-
231-
set -e
232-
233-
path_models="../models-mnt/whisper/"
234-
model_f16="${path_models}/ggml-base.en.bin"
235-
audio_0="${path_models}/jfk.wav"
236-
237-
(time ./bin/whisper -m ${model_f16} -f ${audio_0} ) 2>&1 | tee -a $OUT/${ci}-main.log
238-
239-
grep -q "And so my fellow Americans" $OUT/${ci}-main.log
240-
241-
set +e
242-
}
243-
244-
function gg_sum_whisper {
245-
gg_printf '### %s\n\n' "${ci}"
246-
247-
gg_printf 'Runs short Whisper transcription\n'
248-
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
249-
gg_printf '```\n'
250-
gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
251-
gg_printf '```\n'
252-
}
253-
254221
# sam
255222

256223
function gg_run_sam {
@@ -344,11 +311,12 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
344311
export GGML_METAL_PATH_RESOURCES="${SRC}/build-ci-release/bin"
345312
fi
346313

347-
test $ret -eq 0 && gg_run gpt_2
348-
test $ret -eq 0 && gg_run mnist
349-
test $ret -eq 0 && gg_run whisper
350-
test $ret -eq 0 && gg_run sam
351-
test $ret -eq 0 && gg_run yolo
314+
if [ -z ${GG_BUILD_NO_DOWNLOAD} ]; then
315+
test $ret -eq 0 && gg_run gpt_2
316+
test $ret -eq 0 && gg_run mnist
317+
test $ret -eq 0 && gg_run sam
318+
test $ret -eq 0 && gg_run yolo
319+
fi
352320

353321
if [ -z $GG_BUILD_LOW_PERF ]; then
354322
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 16 ]; then

examples/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
2020

2121
add_subdirectory(gpt-2)
2222
add_subdirectory(gpt-j)
23-
add_subdirectory(whisper)
2423
add_subdirectory(mnist)
2524
add_subdirectory(sam)
2625
add_subdirectory(yolo)

examples/common.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@
2424
#include <io.h>
2525
#endif
2626

27+
#ifdef WHISPER_FFMPEG
28+
// as implemented in ffmpeg_transcode.cpp; only embedded in the common lib if whisper is built with ffmpeg support
29+
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
30+
#endif
31+
2732
// Function to check if the next argument exists
2833
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
2934
if (i + 1 < argc && argv[i + 1][0] != '-') {
@@ -637,7 +642,7 @@ bool is_wav_buffer(const std::string buf) {
637642

638643
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
639644
drwav wav;
640-
std::vector<uint8_t> wav_data; // used for pipe input from stdin
645+
std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
641646

642647
if (fname == "-") {
643648
{
@@ -670,8 +675,19 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
670675
}
671676
}
672677
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
678+
#if defined(WHISPER_FFMPEG)
679+
if (ffmpeg_decode_audio(fname, wav_data) != 0) {
680+
fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
681+
return false;
682+
}
683+
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
684+
fprintf(stderr, "error: failed to read wav data as wav \n");
685+
return false;
686+
}
687+
#else
673688
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
674689
return false;
690+
#endif
675691
}
676692

677693
if (wav.channels != 1 && wav.channels != 2) {

examples/common.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ struct gpt_params {
2121
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
2222
int32_t n_predict = 200; // new tokens to predict
2323
int32_t n_parallel = 1; // number of parallel streams
24-
int32_t n_batch = 8; // batch size for prompt processing
24+
int32_t n_batch = 32; // batch size for prompt processing
2525
int32_t n_ctx = 2048; // context size (this is the KV cache max size)
2626
int32_t n_gpu_layers = 0; // number of layers to offload to the GPU
2727

@@ -185,7 +185,7 @@ class wav_writer {
185185
// It is assumed that PCM data is normalized to a range from -1 to 1
186186
bool write_audio(const float * data, size_t length) {
187187
for (size_t i = 0; i < length; ++i) {
188-
const int16_t intSample = data[i] * 32767;
188+
const int16_t intSample = int16_t(data[i] * 32767);
189189
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
190190
dataSize += sizeof(int16_t);
191191
}

examples/gpt-2/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,6 @@ if (GGML_CUBLAS)
3939
add_compile_definitions(GGML_USE_CUBLAS)
4040
endif()
4141

42-
if (GGML_CLBLAST)
43-
add_compile_definitions(GGML_USE_CLBLAST)
44-
endif()
45-
4642
if (GGML_METAL)
4743
add_compile_definitions(GGML_USE_METAL)
4844
endif()

examples/gpt-2/main-sched.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
#include "ggml-metal.h"
1111
#endif
1212

13+
#ifdef GGML_USE_BLAS
14+
#include "ggml-blas.h"
15+
#endif
16+
1317
#include "common.h"
1418
#include "common-ggml.h"
1519

@@ -131,6 +135,16 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
131135
model.backends.push_back(gpu_backend);
132136
}
133137

138+
#ifdef GGML_USE_BLAS
139+
ggml_backend_t blas_backend = ggml_backend_blas_init();
140+
if (!blas_backend) {
141+
fprintf(stderr, "%s: failed to initialize BLAS backend\n", __func__);
142+
} else {
143+
ggml_backend_blas_set_n_threads(blas_backend, params.n_threads);
144+
model.backends.push_back(blas_backend);
145+
}
146+
#endif
147+
134148
// always add the CPU backend as a fallback
135149
ggml_backend_t cpu_backend = ggml_backend_cpu_init();
136150
ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads);

0 commit comments

Comments
 (0)