Commit 8d6feac

Merge branch 'ggerganov:master' into embed_yolo_files
2 parents: bcf4ec8 + 9d562d7


45 files changed (+8253, -6455 lines)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OF
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" OFF)
 option(GGML_RPC "ggml: use RPC" OFF)
+option(GGML_VULKAN "ggml: use Vulkan" OFF)
 
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
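
For reference, enabling the new option is a one-flag change at configure time. A minimal sketch, assuming an out-of-tree `build/` directory and a working Vulkan SDK; only the `GGML_VULKAN` flag itself comes from this commit, the surrounding steps are illustrative:

```bash
# Hypothetical out-of-tree configure; only -DGGML_VULKAN=ON is defined by this commit.
mkdir -p build && cd build
cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release
```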

README.md

Lines changed: 6 additions & 0 deletions
@@ -122,6 +122,12 @@ cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
 cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
 ```
 
+## Using hipBLAS
+
+```bash
+cmake -DCMAKE_C_COMPILER="$(hipconfig -l)/clang" -DCMAKE_CXX_COMPILER="$(hipconfig -l)/clang++" -DGGML_HIPBLAS=ON
+```
+
 ## Using clBLAST
 
 ```bash
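
The new README section only shows the configure line; a fuller invocation might look like the sketch below. It assumes ROCm's `hipconfig` is on `PATH`; the build directory, source-dir argument, and `cmake --build` step are illustrative additions, not part of the documented command:

```bash
# Sketch: configure and build with hipBLAS, assuming a ROCm install with hipconfig on PATH.
mkdir -p build && cd build
cmake -DCMAKE_C_COMPILER="$(hipconfig -l)/clang" \
      -DCMAKE_CXX_COMPILER="$(hipconfig -l)/clang++" \
      -DGGML_HIPBLAS=ON ..
cmake --build . --config Release
```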

build.zig

Lines changed: 36 additions & 34 deletions
@@ -4,14 +4,9 @@ const builtin = @import("builtin");
 // Zig Version: 0.11.0
 // Zig Build Command: zig build
 // Zig Run Command: zig build -h
-// zig build run_dolly-v2
-// zig build run_gpt-2
 // zig build run_gpt-j
-// zig build run_gpt-neox
 // zig build run_mnist
-// zig build run_mpt
-// zig build run_replit
-// zig build run_starcoder
+// zig build run_magika
 // zig build run_test-grad0
 // zig build run_test-mul-mat0
 // zig build run_test-mul-mat2
@@ -25,33 +20,35 @@ const builtin = @import("builtin");
 // zig build run_zig_test1
 // zig build run_zig_test2
 // zig build run_zig_test3
-pub fn build(b: *std.build.Builder) void {
+pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
     const lib = b.addStaticLibrary(.{
         .name = "ggml",
         .target = target,
         .optimize = optimize,
     });
-    lib.addIncludePath(.{ .path = "./include" });
-    lib.addIncludePath(.{ .path = "./include/ggml" });
-    lib.addCSourceFiles(&.{
+    lib.addIncludePath(b.path("./include"));
+    lib.addIncludePath(b.path("./include/ggml"));
+    lib.addCSourceFiles(.{ .files = &.{
         "src/ggml.c",
-    }, &.{"-std=c11"});
+        "src/ggml-alloc.c",
+        "src/ggml-backend.c",
+        "src/ggml-quants.c",
+    }, .flags = &.{
+        "-std=c11",
+        "-D_GNU_SOURCE",
+        "-D_XOPEN_SOURCE=600",
+    } });
     lib.linkLibC();
     lib.linkLibCpp();
     b.installArtifact(lib);
 
     // examples
     const examples = .{
-        "dolly-v2",
-        "gpt-2",
         "gpt-j",
-        "gpt-neox",
+        "magika",
         "mnist",
-        "mpt",
-        "replit",
-        "starcoder",
         // "whisper",
     };
     inline for (examples) |name| {
@@ -60,16 +57,19 @@ pub fn build(b: *std.build.Builder) void {
             .target = target,
             .optimize = optimize,
         });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
-        exe.addIncludePath(.{ .path = "./examples" });
+        exe.addIncludePath(b.path("./include"));
+        exe.addIncludePath(b.path("./include/ggml"));
+        exe.addIncludePath(b.path("./examples"));
         // exe.addIncludePath("./examples/whisper");
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}),
-            "examples/common.cpp",
-            "examples/common-ggml.cpp",
-            // "examples/whisper/whisper.cpp",
-        }, &.{"-std=c++11"});
+        exe.addCSourceFiles(.{
+            .files = &.{
+                std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}),
+                "examples/common.cpp",
+                "examples/common-ggml.cpp",
+                // "examples/whisper/whisper.cpp",
+            },
+            .flags = &.{"-std=c++11"},
+        });
         exe.linkLibrary(lib);
         b.installArtifact(exe);
         const run_cmd = b.addRunArtifact(exe);
@@ -88,7 +88,7 @@ pub fn build(b: *std.build.Builder) void {
         "test-mul-mat2",
         // "test-opt",
         // "test-svd0",
-        // "test-vec0",
+        "test-vec0",
         "test-vec1",
         // "test-vec2",
         "test0",
@@ -117,11 +117,13 @@ pub fn build(b: *std.build.Builder) void {
             .target = target,
             .optimize = optimize,
         });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
-        exe.addCSourceFiles(&.{
+        exe.addIncludePath(b.path("./include"));
+        exe.addIncludePath(b.path("./include/ggml"));
+        exe.addCSourceFiles(.{ .files = &.{
            std.fmt.comptimePrint("tests/{s}.c", .{name}),
-        }, &.{"-std=c11"});
+        }, .flags = &.{
+            "-std=c11",
+        } });
         exe.linkLibrary(lib);
         b.installArtifact(exe);
         const run_cmd = b.addRunArtifact(exe);
@@ -141,12 +143,12 @@ pub fn build(b: *std.build.Builder) void {
     inline for (zig_tests) |name| {
         const exe = b.addExecutable(.{
             .name = name,
-            .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
+            .root_source_file = b.path(std.fmt.comptimePrint("tests/{s}.zig", .{name})),
             .target = target,
             .optimize = optimize,
         });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
+        exe.addIncludePath(b.path("./include"));
+        exe.addIncludePath(b.path("./include/ggml"));
         exe.linkLibrary(lib);
         b.installArtifact(exe);
         const run_cmd = b.addRunArtifact(exe);
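
To sanity-check the updated build script, the run steps listed in its header comments can be exercised directly. A sketch, assuming a Zig toolchain new enough for the `b.path`/`addCSourceFiles(.{ ... })` API used above; the step names come from the comments in the diff:

```bash
zig build              # builds the ggml static library, examples, and tests
zig build -h           # lists all available steps, including the run_* targets
zig build run_magika   # runs the newly listed magika example
```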

docs/gguf.md

Lines changed: 57 additions & 6 deletions
@@ -20,40 +20,91 @@ The key difference between GGJT and GGUF is the use of a key-value structure for
 
 ### GGUF Naming Convention
 
-GGUF follow a naming convention of `<Model>-<Version>-<ExpertsCount>x<Parameters>-<EncodingScheme>.gguf`
+GGUF follow a naming convention of `<Model>(-<Version>)-(<ExpertsCount>x)<Parameters>-<EncodingScheme>(-<Shard>).gguf`
 
 The components are:
 1. **Model**: A descriptive name for the model type or architecture.
+    - This can be derived from gguf metadata `general.name` substituting spaces for dashes.
 2. **Version**: (Optional) Denotes the model version number, formatted as `v<Major>.<Minor>`
     - If model is missing a version number then assume `v0.0` (Prerelease)
-3. **ExpertsCount**: Indicates the number of experts found in a Mixture of Experts based model.
+    - This can be derived from gguf metadata `general.version`
+3. **ExpertsCount**: (Optional) Indicates the number of experts found in a Mixture of Experts based model.
+    - This can be derived from gguf metadata `llama.expert_count`
 4. **Parameters**: Indicates the number of parameters and their scale, represented as `<count><scale-prefix>`:
     - `Q`: Quadrillion parameters.
     - `T`: Trillion parameters.
     - `B`: Billion parameters.
     - `M`: Million parameters.
    - `K`: Thousand parameters.
 5. **EncodingScheme**: Indicates the weights encoding scheme that was applied to the model. Content, type mixture and arrangement however are determined by user code and can vary depending on project needs.
+6. **Shard**: (Optional) Indicates and denotes that the model has been split into multiple shards, formatted as `<ShardNum>-of-<ShardTotal>`.
+    - *ShardNum* : Shard position in this model. Must be 5 digits padded by zeros.
+      - Shard number always starts from `00001` onwards (e.g. First shard always starts at `00001-of-XXXXX` rather than `00000-of-XXXXX`).
+    - *ShardTotal* : Total number of shards in this model. Must be 5 digits padded by zeros.
 
 #### Parsing Above Naming Convention
 
 To correctly parse a well formed naming convention based gguf filename, it is recommended to read from right to left using `-` as the delimiter. This strategy allow for the most flexibility in model name to include dashes if they so choose, while at the same time allowing for version string to be optional. This approach also gives some future proofing to extend the format if needed in the future.
 
 For example:
 
-* `mixtral-v0.1-8x7B-KQ2.gguf`:
+* `Mixtral-v0.1-8x7B-Q2_K.gguf`:
   - Model Name: Mixtral
   - Version Number: v0.1
   - Expert Count: 8
   - Parameter Count: 7B
-  - Weight Encoding Scheme: KQ2
+  - Weight Encoding Scheme: Q2_K
+  - Shard: N/A
 
 * `Hermes-2-Pro-Llama-3-8B-F16.gguf`:
   - Model Name: Hermes 2 Pro Llama 3
-  - Version Number: v0.0 (`<Version>-` missing)
-  - Expert Count: 0 (`<ExpertsCount>x` missing)
+  - Version Number: v0.0
+  - Expert Count: 0
   - Parameter Count: 8B
   - Weight Encoding Scheme: F16
+  - Shard: N/A
+
+* `Grok-v1.0-100B-Q4_0-00003-of-00009.gguf"`
+  - Model Name: Grok
+  - Version Number: v1.0
+  - Expert Count: 0
+  - Parameter Count: 100B
+  - Weight Encoding Scheme: Q4_0
+  - Shard: 3 out of 9 total shards
+
+You can also try using `/^(?<model_name>[A-Za-z0-9\s-]+)(?:-v(?<major>\d+)\.(?<minor>\d+))?-(?:(?<experts_count>\d+)x)?(?<parameters>\d+[A-Za-z]?)-(?<encoding_scheme>[\w_]+)(?:-(?<shard>\d{5})-of-(?<shardTotal>\d{5}))?\.gguf$/` regular expression to extract all the values above as well. Just don't forget to convert `-` to ` ` for the model name.
+
+<details><summary>Example Node.js Regex Function</summary>
+
+```js
+#!/usr/bin/env node
+const ggufRegex = /^(?<model_name>[A-Za-z0-9\s-]+)(?:-v(?<major>\d+)\.(?<minor>\d+))?-(?:(?<experts_count>\d+)x)?(?<parameters>\d+[A-Za-z]?)-(?<encoding_scheme>[\w_]+)(?:-(?<shard>\d{5})-of-(?<shardTotal>\d{5}))?\.gguf$/;
+
+function parseGGUFFilename(filename) {
+  const match = ggufRegex.exec(filename);
+  if (!match)
+    return null;
+  const {model_name, major = '0', minor = '0', experts_count = null, parameters, encoding_scheme, shard = null, shardTotal = null} = match.groups;
+  return {modelName: model_name.trim().replace(/-/g, ' '), version: `v${major}.${minor}`, expertsCount: experts_count ? +experts_count : null, parameters, encodingScheme: encoding_scheme, shard: shard ? +shard : null, shardTotal: shardTotal ? +shardTotal : null};
+}
+
+const testCases = [
+  {filename: 'Mixtral-v0.1-8x7B-Q2_K.gguf', expected: { modelName: 'Mixtral', version: 'v0.1', expertsCount: 8, parameters: '7B', encodingScheme: 'Q2_K', shard: null, shardTotal: null }},
+  {filename: 'Grok-v1.0-100B-Q4_0-00003-of-00009.gguf', expected: { modelName: 'Grok', version: 'v1.0', expertsCount: null, parameters: '100B', encodingScheme: 'Q4_0', shard: 3, shardTotal: 9 }},
+  {filename: 'Hermes-2-Pro-Llama-3-8B-F16.gguf', expected: { modelName: 'Hermes 2 Pro Llama 3', version: 'v0.0', expertsCount: null, parameters: '8B', encodingScheme: 'F16', shard: null, shardTotal: null }},
+  {filename: 'Hermes-2-Pro-Llama-3-v32.33-8Q-F16.gguf', expected: { modelName: 'Hermes 2 Pro Llama 3', version: 'v32.33', expertsCount: null, parameters: '8Q', encodingScheme: 'F16', shard: null, shardTotal: null }},
+  {filename: 'not-a-known-arrangement.gguf', expected: null},
+];
+
+testCases.forEach(({ filename, expected }) => {
+  const result = parseGGUFFilename(filename);
+  const passed = JSON.stringify(result) === JSON.stringify(expected);
+  console.log(`${filename}: ${passed ? "PASS" : "FAIL"}`);
+});
+```
+
+</details>
+
 
 ### File Structure
 
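
To try the Node.js parser from the new docs section outside the document, the JS block above can be saved to a standalone file and run directly; a sketch with a hypothetical filename:

```bash
# Hypothetical: the JS block from the diff above saved as parse-gguf-name.js
node parse-gguf-name.js
# Each test-case filename should print PASS if the parser output matches the expected values.
```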

examples/whisper/whisper.cpp

Lines changed: 1 addition & 1 deletion
@@ -2588,7 +2588,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                 if (aheads_cross_QKs == NULL) {
                     aheads_cross_QKs = aheads_KQs;
                 } else {
-                    aheads_cross_QKs = ggml_concat(ctx0, aheads_cross_QKs, aheads_KQs);
+                    aheads_cross_QKs = ggml_concat(ctx0, aheads_cross_QKs, aheads_KQs, 2);
                 }
             }
         }

scripts/sync-llama-am.sh

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ if [ -f $SRC_GGML/llama-src.patch ]; then
     -e 's/\/ggml-sycl\.h/\/src\/ggml-sycl.h/g' \
     -e 's/\/ggml-vulkan\.cpp/\/src\/ggml-vulkan.cpp/g' \
     -e 's/\/ggml-vulkan\.h/\/src\/ggml-vulkan.h/g' \
+    -e 's/\/ggml_vk_generate_shaders\.py/\/src\/ggml_vk_generate_shaders.py/g' \
     -e 's/\/ggml\.h/\/include\/ggml\/ggml.h/g' \
     -e 's/\/ggml-alloc\.h/\/include\/ggml\/ggml-alloc.h/g' \
     -e 's/\/ggml-backend\.h/\/include\/ggml\/ggml-backend.h/g' \

scripts/sync-llama.last

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e8a7fd4fb06d82f663850c21fcf86c0fb98ad9b4
+0e8d8bfd6caf1d0a8cbdf9d3d5c06fbbb9dfced8

scripts/sync-llama.sh

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ cp -rpv ../llama.cpp/ggml-sycl.cpp src/ggml-sycl.cpp
 cp -rpv ../llama.cpp/ggml-sycl.h src/ggml-sycl.h
 cp -rpv ../llama.cpp/ggml-vulkan.cpp src/ggml-vulkan.cpp
 cp -rpv ../llama.cpp/ggml-vulkan.h src/ggml-vulkan.h
+cp -rpv ../llama.cpp/ggml_vk_generate_shaders.py src/ggml_vk_generate_shaders.py
 cp -rpv ../llama.cpp/ggml.h include/ggml/ggml.h
 cp -rpv ../llama.cpp/ggml-alloc.h include/ggml/ggml-alloc.h
 cp -rpv ../llama.cpp/ggml-backend.h include/ggml/ggml-backend.h
