Commit 9713c93

Merge pull request #19 from ggml-org/master
Merging from upstream
2 parents: d33545c + 3f9da22


74 files changed: 1,881 additions, 3,316 deletions

common/arg.cpp

Lines changed: 644 additions & 71 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 14 additions & 519 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 13 additions & 36 deletions
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = "";    // model local path // NOLINT
+    std::string url = "";     // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -197,19 +200,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -267,12 +262,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -546,23 +540,6 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

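The heart of this merge is the new `common_params_model` struct above: the previously flat `model`, `model_url`, `hf_repo` and `hf_file` strings are folded into one value that is reused for the main model, the speculative (draft) model, the vocoder and the multimodal projector, and the standalone `common_load_model_from_url` / `common_load_model_from_hf` / `common_get_hf_file` declarations are dropped from the public header. A minimal sketch of how caller code is affected, based only on the hunks above (the paths and repo names are placeholders, and the download wiring in `common/arg.cpp` is not shown here):

```cpp
#include "common.h"

// Sketch only: field names follow the diff above; the values are hypothetical.
static void configure(common_params & params) {
    // One nested struct now describes where each model comes from.
    params.model.path    = "models/base-q4_0.gguf";    // local path (was params.model)
    params.model.hf_repo = "some-org/some-model-GGUF"; // placeholder HF repo (was params.hf_repo)
    params.model.hf_file = "base-q4_0.gguf";           // placeholder HF file (was params.hf_file)

    // The same struct type is reused for the other model slots.
    params.speculative.model.path = "models/draft-q4_0.gguf"; // draft model for speculative decoding
    params.vocoder.model.path     = "models/vocoder.gguf";
    params.mmproj.path            = "models/mmproj.gguf";     // multimodal projector

    // Fields such as hf_token and model_alias remain plain strings on common_params.
    params.hf_token = "";
}

// Call sites that used to read params.model as a string now go one level deeper,
// e.g. llama_model_load_from_file(params.model.path.c_str(), mparams);
```

The example diffs further down (batched-bench, batched, export-lora, gritlm) show exactly this one-line change at each call site.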
common/minja/minja.hpp

Lines changed: 2 additions & 2 deletions
@@ -240,7 +240,7 @@ class Value : public std::enable_shared_from_this<Value> {
       auto index = key.get<int>();
       return array_->at(index < 0 ? array_->size() + index : index);
     } else if (object_) {
-      if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+      if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
       auto it = object_->find(key.primitive_);
       if (it == object_->end()) return Value();
       return it->second;
@@ -249,7 +249,7 @@ class Value : public std::enable_shared_from_this<Value> {
   }
   void set(const Value& key, const Value& value) {
     if (!object_) throw std::runtime_error("Value is not an object: " + dump());
-    if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
     (*object_)[key.primitive_] = value;
   }
   Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {

convert_hf_to_gguf.py

Lines changed: 2 additions & 5 deletions
@@ -5146,10 +5146,7 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
@@ -5175,7 +5172,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim", n_embd // n_head)
+        head_dim = self.hparams.get("head_dim") or n_embd // n_head
 
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)

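The `head_dim` change above is more than a style tweak: `hparams.get("head_dim", fallback)` only falls back when the key is missing, whereas `hparams.get("head_dim") or fallback` also falls back when the key is present but set to null/`None` or `0`. A small illustration with made-up config values:

```python
# Hypothetical hparams as loaded from a config that sets head_dim to null.
hparams = {"hidden_size": 4096, "num_attention_heads": 32, "head_dim": None}

fallback = hparams["hidden_size"] // hparams["num_attention_heads"]  # 128

# Default-argument form: the key exists, so the stored None is returned as-is.
old_style = hparams.get("head_dim", fallback)    # -> None

# `or` form: any falsy stored value (None, 0) also triggers the fallback.
new_style = hparams.get("head_dim") or fallback  # -> 128

print(old_style, new_style)
```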
docs/backend/SYCL.md

Lines changed: 7 additions & 32 deletions
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
 
 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
-- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
 
@@ -227,16 +227,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
 
 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
 
-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
-```
-
 **oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
 
 ```sh
@@ -250,16 +240,6 @@
 
 **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
 
-**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-# Find your HIPTARGET with rocminfo, under the key 'Name:'
-cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
-cmake --build buildWithrocBLAS --config Release
-```
-
 3. **Verify installation and environment**
 
 In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -324,13 +304,10 @@ cmake --build build --config Release -j -v
 
 #### Nvidia GPU
 
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
 
+```sh
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
@@ -347,12 +324,10 @@ cmake --build build --config Release -j -v
 
 #### AMD GPU
 
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
 
+```sh
 # Build LLAMA with rocBLAS acceleration through SYCL
 
 ## AMD

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);

examples/export-lora/export-lora.cpp

Lines changed: 1 addition & 1 deletion
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
 
     g_verbose = (params.verbosity > 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());

examples/gritlm/gritlm.cpp

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);
