Skip to content

Commit e63e542

Browse files
author
ochafik
committed
Merge remote-tracking branch 'origin/master' into tool-diffs
2 parents 329d943 + 2004644 commit e63e542

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

53 files changed

+1115
-2508
lines changed

ci/run.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
5959
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
6060
# Enable sysman for correct memory reporting
6161
export ZES_ENABLE_SYSMAN=1
62+
# to circumvent precision issues on CPY operations
63+
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
6264
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
6365
fi
6466

common/arg.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "gguf.h" // for reading GGUF splits
22
#include "arg.h"
33

4+
#include "common.h"
45
#include "log.h"
56
#include "sampling.h"
67
#include "chat.h"
@@ -848,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
848849
params.kv_overrides.back().key[0] = 0;
849850
}
850851

852+
if (!params.tensor_buft_overrides.empty()) {
853+
params.tensor_buft_overrides.push_back({nullptr, nullptr});
854+
}
855+
851856
if (params.reranking && params.embedding) {
852857
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
853858
}
@@ -2180,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
21802185
exit(0);
21812186
}
21822187
));
2188+
add_opt(common_arg(
2189+
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
2190+
"override tensor buffer type", [](common_params & params, const std::string & value) {
2191+
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
2192+
if (buft_list.empty()) {
2193+
// enumerate all the devices and add their buffer types to the list
2194+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
2195+
auto * dev = ggml_backend_dev_get(i);
2196+
auto * buft = ggml_backend_dev_buffer_type(dev);
2197+
if (buft) {
2198+
buft_list[ggml_backend_buft_name(buft)] = buft;
2199+
}
2200+
}
2201+
}
2202+
2203+
for (const auto & override : string_split<std::string>(value, ',')) {
2204+
std::string::size_type pos = override.find('=');
2205+
if (pos == std::string::npos) {
2206+
throw std::invalid_argument("invalid value");
2207+
}
2208+
std::string tensor_name = override.substr(0, pos);
2209+
std::string buffer_type = override.substr(pos + 1);
2210+
2211+
if (buft_list.find(buffer_type) == buft_list.end()) {
2212+
printf("Available buffer types:\n");
2213+
for (const auto & it : buft_list) {
2214+
printf(" %s\n", ggml_backend_buft_name(it.second));
2215+
}
2216+
throw std::invalid_argument("unknown buffer type");
2217+
}
2218+
// FIXME: this leaks memory
2219+
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
2220+
}
2221+
}
2222+
));
21832223
add_opt(common_arg(
21842224
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
21852225
"number of layers to store in VRAM",

common/common.cpp

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77

88
#include "common.h"
99
#include "log.h"
10-
// Change JSON_ASSERT from assert() to GGML_ASSERT:
11-
#define JSON_ASSERT GGML_ASSERT
12-
#include "json.hpp"
1310
#include "llama.h"
1411

1512
#include <algorithm>
@@ -56,8 +53,6 @@
5653
#pragma warning(disable: 4244 4267) // possible loss of data
5754
#endif
5855

59-
using json = nlohmann::ordered_json;
60-
6156
//
6257
// CPU utils
6358
//
@@ -1067,22 +1062,32 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
10671062
if (!params.devices.empty()) {
10681063
mparams.devices = params.devices.data();
10691064
}
1065+
10701066
if (params.n_gpu_layers != -1) {
10711067
mparams.n_gpu_layers = params.n_gpu_layers;
10721068
}
1069+
10731070
mparams.main_gpu = params.main_gpu;
10741071
mparams.split_mode = params.split_mode;
10751072
mparams.tensor_split = params.tensor_split;
10761073
mparams.use_mmap = params.use_mmap;
10771074
mparams.use_mlock = params.use_mlock;
10781075
mparams.check_tensors = params.check_tensors;
1076+
10791077
if (params.kv_overrides.empty()) {
10801078
mparams.kv_overrides = NULL;
10811079
} else {
10821080
GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
10831081
mparams.kv_overrides = params.kv_overrides.data();
10841082
}
10851083

1084+
if (params.tensor_buft_overrides.empty()) {
1085+
mparams.tensor_buft_overrides = NULL;
1086+
} else {
1087+
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
1088+
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
1089+
}
1090+
10861091
return mparams;
10871092
}
10881093

@@ -1565,26 +1570,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
15651570

15661571
return result;
15671572
}
1568-
1569-
template <>
1570-
json common_grammar_trigger::to_json() const {
1571-
json out {
1572-
{"type", (int) type},
1573-
{"value", value},
1574-
};
1575-
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
1576-
out["token"] = (int) token;
1577-
}
1578-
return out;
1579-
}
1580-
1581-
template <>
1582-
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
1583-
common_grammar_trigger out;
1584-
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
1585-
out.value = in.at("value").get<std::string>();
1586-
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
1587-
out.token = (llama_token) in.at("token").get<int>();
1588-
}
1589-
return out;
1590-
}

common/common.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
121121
common_grammar_trigger_type type;
122122
std::string value;
123123
llama_token token = LLAMA_TOKEN_NULL;
124-
125-
// T can only be nlohmann::ordered_json
126-
template <class T> T to_json() const;
127-
template <class T> static common_grammar_trigger from_json(const T & in);
128124
};
129125

130126
// sampling parameters
@@ -283,6 +279,7 @@ struct common_params {
283279
std::vector<std::string> in_files; // all input files
284280
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
285281
std::vector<llama_model_kv_override> kv_overrides;
282+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
286283

287284
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
288285
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

common/minja/minja.hpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,14 +2606,18 @@ inline std::shared_ptr<Context> Context::builtins() {
26062606
auto & text = args.at("text");
26072607
return text.is_null() ? text : Value(strip(text.get<std::string>()));
26082608
}));
2609-
globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
2610-
auto text = args.at("text");
2611-
if (text.is_null()) return text;
2612-
std::string res;
2613-
auto str = text.get<std::string>();
2614-
std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
2615-
return Value(res);
2616-
}));
2609+
auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
2610+
return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
2611+
auto text = args.at("text");
2612+
if (text.is_null()) return text;
2613+
std::string res;
2614+
auto str = text.get<std::string>();
2615+
std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
2616+
return Value(res);
2617+
});
2618+
};
2619+
globals.set("lower", char_transform_function("lower", ::tolower));
2620+
globals.set("upper", char_transform_function("upper", ::toupper));
26172621
globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
26182622
args.expectArgs("default", {2, 3}, {0, 1});
26192623
auto & value = args.args[0];

docs/backend/SYCL.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,10 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
302302
cmake --build build --config Release -j -v
303303
```
304304

305+
It is possible to come across some precision issues when running tests that stem from using faster
306+
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
307+
to `-cl-fp32-correctly-rounded-divide-sqrt`.
308+
305309
#### Nvidia GPU
306310

307311
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
@@ -322,6 +326,9 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=
322326
cmake --build build --config Release -j -v
323327
```
324328

329+
It is possible to come across some precision issues when running tests that stem from using faster
330+
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
331+
325332
#### AMD GPU
326333

327334
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.

examples/parallel/parallel.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {
106106

107107
common_params params;
108108

109+
params.n_predict = 128;
110+
109111
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
110112
return 1;
111113
}

examples/server/server.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ struct slot_params {
134134

135135
auto grammar_triggers = json::array();
136136
for (const auto & trigger : sampling.grammar_triggers) {
137-
grammar_triggers.push_back(trigger.to_json<json>());
137+
server_grammar_trigger ct(std::move(trigger));
138+
grammar_triggers.push_back(ct.to_json());
138139
}
139140

140141
return json {
@@ -379,9 +380,9 @@ struct server_task {
379380
const auto grammar_triggers = data.find("grammar_triggers");
380381
if (grammar_triggers != data.end()) {
381382
for (const auto & t : *grammar_triggers) {
382-
auto ct = common_grammar_trigger::from_json(t);
383-
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
384-
const auto & word = ct.value;
383+
server_grammar_trigger ct(t);
384+
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
385+
const auto & word = ct.value.value;
385386
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
386387
if (ids.size() == 1) {
387388
auto token = ids[0];
@@ -399,14 +400,14 @@ struct server_task {
399400
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
400401
}
401402
} else {
402-
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
403-
SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.c_str());
404-
} else if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) {
405-
SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.c_str());
403+
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
404+
SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str());
405+
} else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) {
406+
SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str());
406407
} else {
407408
throw std::runtime_error("Unknown grammar trigger type");
408409
}
409-
params.sampling.grammar_triggers.push_back(ct);
410+
params.sampling.grammar_triggers.emplace_back(std::move(ct.value));
410411
}
411412
}
412413
}

examples/server/utils.hpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul
5858

5959
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
6060

61+
// thin wrapper around common_grammar_trigger with (de)serialization functions
62+
struct server_grammar_trigger {
63+
common_grammar_trigger value;
64+
65+
server_grammar_trigger() = default;
66+
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
67+
server_grammar_trigger(const json & in) {
68+
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
69+
value.value = in.at("value").get<std::string>();
70+
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
71+
value.token = (llama_token) in.at("token").get<int>();
72+
}
73+
}
74+
75+
json to_json() const {
76+
json out {
77+
{"type", (int) value.type},
78+
{"value", value.value},
79+
};
80+
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
81+
out["token"] = (int) value.token;
82+
}
83+
return out;
84+
}
85+
};
86+
6187
//
6288
// tokenizer and input processing utils
6389
//
@@ -601,7 +627,8 @@ static json oaicompat_completion_params_parse(
601627
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
602628
auto grammar_triggers = json::array();
603629
for (const auto & trigger : chat_params.grammar_triggers) {
604-
grammar_triggers.push_back(trigger.to_json<json>());
630+
server_grammar_trigger ct(trigger);
631+
grammar_triggers.push_back(ct.to_json());
605632
}
606633
llama_params["grammar_triggers"] = grammar_triggers;
607634
llama_params["preserved_tokens"] = chat_params.preserved_tokens;

ggml/src/ggml-cann/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
5151
${CANN_INSTALL_DIR}/acllib/include
5252
)
5353

54-
add_subdirectory(kernels)
5554
list(APPEND CANN_LIBRARIES
5655
ascendcl
5756
nnopbase
5857
opapi
5958
acl_op_compiler
60-
ascendc_kernels
6159
)
6260

6361
file(GLOB GGML_SOURCES_CANN "*.cpp")

0 commit comments

Comments
 (0)