Commit 82caffa

llama : de-shadow libllama [no ci]
1 parent 32e7b9d commit 82caffa

13 files changed: +182 / -180 lines changed
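
The theme of the commit: rename struct members, function parameters, and locals across libllama so that no identifier shadows another one in an enclosing scope, which is what warnings such as GCC's and Clang's -Wshadow family report. A minimal sketch of the recurring pattern, modeled on (but not copied from) the llama_model_loader change further below; the structs here are illustrative only:

#include <cstdio>

// Before: constructor parameters shadow the members of the same name,
// so `this->` is needed to disambiguate and shadow warnings fire (e.g. GCC -Wshadow).
struct loader_before {
    bool use_mmap      = true;
    bool check_tensors = false;

    loader_before(bool use_mmap, bool check_tensors) {
        this->use_mmap      = use_mmap;
        this->check_tensors = check_tensors;
    }
};

// After: parameters carry a distinct suffix, assignments are unambiguous,
// and the shadowing warning disappears.
struct loader_after {
    bool use_mmap      = true;
    bool check_tensors = false;

    loader_after(bool use_mmap_cur, bool check_tensors_cur) {
        use_mmap      = use_mmap_cur;
        check_tensors = check_tensors_cur;
    }
};

int main() {
    loader_after ml(/*use_mmap_cur=*/false, /*check_tensors_cur=*/true);
    std::printf("use_mmap=%d check_tensors=%d\n", ml.use_mmap, ml.check_tensors);
    return 0;
}

The same idea runs through the whole diff: where a parameter collided with a member, either the parameter gains a _cur suffix (use_mmap_cur, size_cur) or the member itself is renamed (seq -> seqs, batch -> batch_ptr).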

ci/run.sh

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with METAL support
+# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #

src/llama-batch.cpp

Lines changed: 42 additions & 41 deletions
@@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
     // clear empty sequences
     // the previous ubatch is assumed to be gone,
     // so nothing should refer to values in these sequences anymore.
-    for (size_t i = seq.size(); i-- > 0;) {
-        if (seq[i].length == 0) {
-            seq.pop_back();
+    for (size_t i = seqs.size(); i-- > 0;) {
+        if (seqs[i].length == 0) {
+            seqs.pop_back();
         } else {
             break;
         }
@@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
 }
 
 void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
-    GGML_ASSERT(batch != nullptr);
+    GGML_ASSERT(batch_ptr != nullptr);
     GGML_ASSERT(length <= seq.length);
     // Can only add sequences of equal lengths to a batch,
     // otherwise it isn't clear to which sequence a token belongs
     GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
     GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
     // NOTE: loops are separated for cache-friendliness
-    if (batch->token) {
+    if (batch_ptr->token) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
-                ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
+                ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]];
             }
         } else {
             // simple split
-            ubatch.token = batch->token + seq.offset;
+            ubatch.token = batch_ptr->token + seq.offset;
         }
     } else {
         ubatch.token = nullptr;
     }
-    if (batch->embd) {
+    if (batch_ptr->embd) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
                     ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
-                    batch->embd + (n_embd * ids[seq.offset + i]),
+                    batch_ptr->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
         } else {
             // simple split
-            ubatch.embd = batch->embd + (n_embd * seq.offset);
+            ubatch.embd = batch_ptr->embd + (n_embd * seq.offset);
         }
     } else {
         ubatch.embd = nullptr;
     }
     if (ubatch.equal_seqs) {
         for (size_t i = 0; i < length; ++i) {
-            ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
+            ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]];
         }
     } else {
         // simple split
-        ubatch.pos = batch->pos + seq.offset;
+        ubatch.pos = batch_ptr->pos + seq.offset;
     }
     if (ubatch.equal_seqs) {
         ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
@@ -86,33 +86,33 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
         }
     } else {
         // simple split
-        if (batch->n_seq_id) {
-            ubatch.n_seq_id = batch->n_seq_id + seq.offset;
+        if (batch_ptr->n_seq_id) {
+            ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset;
         } else {
             for (size_t i = 0; i < length; ++i) {
                 ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
             }
         }
-        if (batch->seq_id) {
-            ubatch.seq_id = batch->seq_id + seq.offset;
+        if (batch_ptr->seq_id) {
+            ubatch.seq_id = batch_ptr->seq_id + seq.offset;
         }
     }
     if (logits_all) {
         for (size_t i = 0; i < length; ++i) {
             ubatch.output[ubatch.n_tokens + i] = 1;
             out_ids.push_back(ids[seq.offset + i]);
         }
-    } else if (batch->logits) {
+    } else if (batch_ptr->logits) {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 size_t id = ids[seq.offset + i];
-                int8_t is_output = batch->logits[id];
+                int8_t is_output = batch_ptr->logits[id];
                 ubatch.output[ubatch.n_tokens + i] = is_output;
                 if (is_output) { out_ids.push_back(id); }
             }
         } else {
             // simple split
-            ubatch.output = batch->logits + seq.offset;
+            ubatch.output = batch_ptr->logits + seq.offset;
             for (size_t i = 0; i < length; ++i) {
                 if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
             }
@@ -139,28 +139,28 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
 
 llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
     ubatch.equal_seqs = false;
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[0];
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs[0];
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
-        GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
+        GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
         add_seq_to_ubatch(ubatch, s, length);
     }
     return ubatch;
 }
 
 llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
         size_t length = 0;
         size_t n_tokens_in_ubatch = 0;
-        GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
+        GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits
         // smallest first, because it's easier to split this way;
         // starting from the end to pop in constant time.
-        for (size_t i = seq.size(); i-- > 0;) {
-            llama_sbatch_seq & s = seq[i];
+        for (size_t i = seqs.size(); i-- > 0;) {
+            llama_sbatch_seq & s = seqs[i];
             GGML_ASSERT(s.length > 0);
             if (length == 0) {
                 length = s.length < n_ubatch ? s.length : n_ubatch;
@@ -179,33 +179,34 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) {
 
 llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
-    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
-    if (!seq.empty()) {
-        llama_sbatch_seq & s = seq[seq.size() - 1];
+    llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr);
+    if (!seqs.empty()) {
+        llama_sbatch_seq & s = seqs.back();
         size_t length = s.length < n_ubatch ? s.length : n_ubatch;
         GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
         add_seq_to_ubatch(ubatch, s, length);
     }
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) {
     GGML_ASSERT(batch.n_tokens >= 0);
-    this->batch = &batch;
-    this->n_embd = n_embd;
-    this->logits_all = logits_all;
+
+    batch_ptr = &batch;
+    n_embd = n_embd_cur;
+    logits_all = logits_all_cur;
 
     n_tokens = batch.n_tokens;
     ids.resize(n_tokens);
     out_ids.clear();
-    // TODO: reserve out_ids and seq
+    // TODO: reserve out_ids and seqs
 
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
     if (simple_split) {
-        seq.resize(1);
-        llama_sbatch_seq & s = seq[0];
+        seqs.resize(1);
+        llama_sbatch_seq & s = seqs[0];
         s.n_seq_id = 0;
         s.seq_id = nullptr;
         s.offset = 0;
@@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
                 }
             }
             llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
-            seq.push_back(new_seq);
-            last_seq = &seq.back();
+            seqs.push_back(new_seq);
+            last_seq = &seqs.back();
         }
         // keep shared prompts first at the end, then sort by length descending.
-        std::sort(seq.begin(), seq.end(),
+        std::sort(seqs.begin(), seqs.end(),
             [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
                 if (a.n_seq_id == b.n_seq_id) {
                     return a.length > b.length;
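
In llama_sbatch the fix goes the other way around: the members are renamed (seq -> seqs, batch -> batch_ptr) so that the natural parameter names, the llama_sbatch_seq & seq taken by add_seq_to_ubatch and the const llama_batch & batch taken by from_batch, no longer shadow them. A reduced, hypothetical sketch of that shape (types simplified, not code from the commit):

#include <cstddef>
#include <vector>

struct sbatch_sketch {
    std::vector<int>         seqs;                // was: seq
    const std::vector<int> * batch_ptr = nullptr; // was: batch

    // `seq` stays the obvious name for the one sequence being added;
    // it no longer collides with the member.
    void add_seq_to_ubatch(int & seq, size_t length) {
        (void) length;
        seqs.push_back(seq);
    }

    // likewise `batch` for the incoming batch; the assignment needs no `this->`
    void from_batch(const std::vector<int> & batch) {
        batch_ptr = &batch;
    }
};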

src/llama-batch.h

Lines changed: 2 additions & 2 deletions
@@ -45,9 +45,9 @@ struct llama_sbatch {
     std::vector<size_t> ids;
     // batch indices of the output
     std::vector<size_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    std::vector<llama_sbatch_seq> seqs;
 
-    const llama_batch * batch = nullptr;
+    const llama_batch * batch_ptr = nullptr;
 
     // buffers for the ubatch
     std::vector<llama_token> ubatch_token;

src/llama-context.cpp

Lines changed: 2 additions & 2 deletions
@@ -916,8 +916,8 @@ struct llama_data_write {
         write(&n_seq_id, sizeof(n_seq_id));
 
         if (n_seq_id) {
-            for (auto seq_id : cell.seq_id) {
-                write(&seq_id, sizeof(seq_id));
+            for (auto sid : cell.seq_id) {
+                write(&sid, sizeof(sid));
             }
         }
     }
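
Here the range-for variable is renamed to sid so it no longer reuses the name seq_id; presumably it shadowed an identically named variable in an enclosing scope of the state-writing code, which is not visible in this hunk. A small hypothetical illustration of the pattern:

#include <cstdint>
#include <cstdio>
#include <vector>

// An outer `seq_id` (say, a function parameter) plus a loop variable of the
// same name would shadow it; giving the loop variable its own name (`sid`,
// as in the commit) keeps both readable.
static void write_cell_seq_ids(const std::vector<int32_t> & cell_seq_ids, int32_t seq_id) {
    for (int32_t sid : cell_seq_ids) { // was: for (auto seq_id : ...)
        std::printf("filter seq_id=%d, cell seq_id=%d\n", seq_id, sid);
    }
}

int main() {
    write_cell_seq_ids({0, 1, 2}, -1);
    return 0;
}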

src/llama-grammar.cpp

Lines changed: 1 addition & 1 deletion
@@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence(
             pos = parse_space(pos + 1, is_nested);
 
             if (is_digit_char(*pos)) {
-                const char * int_end = parse_int(pos);
+                int_end = parse_int(pos);
                 max_times = std::stoul(std::string(pos, int_end - pos));
                 pos = parse_space(int_end, is_nested);
             }
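
The grammar-parser change drops the const char * from the second declaration, so the branch assigns to the int_end that already exists in the enclosing scope instead of introducing a shadowing copy; the behaviour stays the same, only the warning goes away. A self-contained, hypothetical sketch of the pattern (helper names are made up, not the parser's API):

#include <cctype>
#include <cstdio>
#include <string>

static const char * skip_digits(const char * pos) {
    while (std::isdigit((unsigned char) *pos)) { ++pos; }
    return pos;
}

// Parse "m" or "m,n": the second number must reuse the existing `int_end`
// (plain assignment), because `const char * int_end = ...` inside the branch
// would shadow the outer declaration.
static void parse_range(const char * pos, unsigned long & min_times, unsigned long & max_times) {
    const char * int_end = skip_digits(pos);      // outer declaration
    min_times = std::stoul(std::string(pos, int_end - pos));

    pos = int_end;
    if (*pos == ',') {
        ++pos;
        if (std::isdigit((unsigned char) *pos)) {
            int_end = skip_digits(pos);           // assign, do not re-declare
            max_times = std::stoul(std::string(pos, int_end - pos));
        }
    }
}

int main() {
    unsigned long lo = 0, hi = 0;
    parse_range("2,5", lo, hi);
    std::printf("min=%lu max=%lu\n", lo, hi);
    return 0;
}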

src/llama-mmap.cpp

Lines changed: 5 additions & 5 deletions
@@ -454,8 +454,8 @@ struct llama_mlock::impl {
         return (size_t) sysconf(_SC_PAGESIZE);
     }
 
-    bool raw_lock(const void * addr, size_t size) const {
-        if (!mlock(addr, size)) {
+    bool raw_lock(const void * addr_cur, size_t size_cur) const {
+        if (!mlock(addr_cur, size_cur)) {
             return true;
         }
 
@@ -475,12 +475,12 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) {
             suggest = false;
         }
 
         LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+                size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : "");
         return false;
     }
 
@@ -535,7 +535,7 @@ struct llama_mlock::impl {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t len) const {
+    bool raw_lock(const void * addr_cur, size_t size_cur) const {
         LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
         return false;
     }
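
In llama_mlock::impl the renamed parameters matter for the warning text as well: the log reports both the size of the request that just failed and the member that tracks how many bytes were locked before, and with distinct names (size_cur vs size) the second argument no longer needs this->. A small illustrative sketch, not the actual class:

#include <cstddef>
#include <cstdio>

struct mlock_sketch {
    size_t size = 0; // bytes successfully locked so far (member)

    // `size_cur` is the new request; it no longer shadows the member above.
    bool raw_lock(const void * addr_cur, size_t size_cur) const {
        (void) addr_cur;
        std::fprintf(stderr,
            "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes)\n",
            size_cur, size);
        return false;
    }
};

int main() {
    mlock_sketch ml;
    int dummy = 0;
    (void) ml.raw_lock(&dummy, 4096);
    return 0;
}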

src/llama-model-loader.cpp

Lines changed: 8 additions & 8 deletions
@@ -413,7 +413,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 
-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
+llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
 
     if (!llama_mmap::SUPPORTED) {
         LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
-        use_mmap = false;
+        use_mmap_cur = false;
     }
 
-    this->use_mmap = use_mmap;
-    this->check_tensors = check_tensors;
+    use_mmap = use_mmap_cur;
+    check_tensors = check_tensors_cur;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data(
 
     // If the backend is supported, create pinned memory buffers and events for synchronisation.
     for (size_t idx = 0; idx < n_buffers; ++idx) {
-        auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-        if (!buf) {
+        auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+        if (!buf_new) {
             LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                 ggml_backend_dev_name(dev));
             return nullptr;
         }
 
-        host_buffers.emplace_back(buf);
-        host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+        host_buffers.emplace_back(buf_new);
+        host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new));
 
         auto * event = ggml_backend_event_new(dev);
         if (!event) {

src/llama-model-loader.h

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
+    llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
