Merged
55 commits
62eeaaf
scripts : update sync
ggerganov Nov 15, 2024
a53ac6f
metal : fix minor string leaks (ggml/1004)
pminev Nov 1, 2024
787b66f
cmake : make it possible linking ggml as external lib (ggml/1003)
ykhrustalev Nov 2, 2024
85c678c
musa: workaround for Guilty Lockup in cleaning src0 (llama/10042)
yeahdongcn Oct 28, 2024
5d22f5b
ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the…
cyzero-kim Oct 29, 2024
a0ea7d4
llama : refactor model loader with backend registry (llama/10026)
slaren Oct 30, 2024
39fdd62
ggml : add Q4_0_8_8 RISC-V GEMV and GEMM kernels (llama/10029)
xctan Oct 30, 2024
4cbca54
ggml : fix memory leaks when loading invalid gguf files (llama/10094)
slaren Oct 30, 2024
d378f19
kompute: add backend registry / device interfaces (llama/10045)
slp Oct 30, 2024
1812284
kompute: add mul_mat_q4_k shader (llama/10097)
slp Oct 31, 2024
1c83752
ggml : check tensor name lengths in gguf files (llama/10100)
slaren Oct 31, 2024
6b7f6be
llama : fix buffer checks for mamba and rwk (llama/10111)
slaren Oct 31, 2024
72cbb25
build: fix build error in Windows env with OneAPI setup (llama/10107)
kylo5aby Nov 1, 2024
6352fcd
ggml : remove ggml_scratch (llama/10121)
ggerganov Nov 1, 2024
c28c6e8
vulkan : improve ggml_vk_create_buffer error handling (llama/9898)
FanShupei Nov 1, 2024
c7c5a95
llama : use smart pointers for ggml resources (llama/10117)
slaren Nov 1, 2024
749d287
llama : add simple-chat example (llama/10124)
slaren Nov 1, 2024
384ee00
metal : minor fixup in FA kernel (llama/10143)
ggerganov Nov 3, 2024
63f7286
ggml : move CPU backend to a separate file (llama/10144)
slaren Nov 3, 2024
fa240b2
CANN: adjust backend registry refactor. (llama/10158)
leo-pony Nov 4, 2024
e75a453
metal : move dequantize templates to beginning of MSL source (llama/0)
ggerganov Nov 4, 2024
e72fc8a
metal : simplify f16 and f32 dequant kernels (llama/0)
ggerganov Nov 4, 2024
801fdc2
cuda : clear error after changing peer access (llama/10153)
slaren Nov 4, 2024
03b75f4
fix build break on arm64 linux (llama/10166)
snadampal Nov 4, 2024
c7655fe
ggml : fix q4xx mat mul, increase ggml_aligned_malloc alignment (llam…
slaren Nov 4, 2024
45ecfd9
ggml : fix gelu tables initialization (llama/10172)
slaren Nov 4, 2024
26e5a75
Q6_K AVX improvements (llama/10118)
netrunnereve Nov 4, 2024
7580d7e
ggml : fix arch check in bf16_to_fp32 (llama/10164)
slaren Nov 4, 2024
8aaa1e9
metal : add quantized FA support (llama/10149)
ggerganov Nov 6, 2024
354191f
ggml : adjust is_first_call init value (llama/10193)
ggerganov Nov 6, 2024
406c648
metal : fix from ptr buffer name (llama/10189)
slaren Nov 6, 2024
42cbece
metal : add BF16 support (llama/8439)
ggerganov Nov 6, 2024
b8fbee4
Optimize RWKV6 Operator Naming and Implement Multi-core CPU/ SYCL Acc…
zhiyuan1i Nov 7, 2024
3cae70b
fix q4_0_8_8 format for corrupted tokens issue (llama/10198)
snadampal Nov 7, 2024
44c0abf
ggml : add ggml-cpu.h to the public headers (llama/10204)
slaren Nov 7, 2024
d6fb23b
metal : optimize FA kernels (llama/10171)
ggerganov Nov 8, 2024
2b11c93
metal : improve clarity (minor) (llama/10171)
ggerganov Nov 8, 2024
6998ecf
metal : opt-in compile flag for BF16 (llama/10218)
ggerganov Nov 8, 2024
f48abc2
ggml : optimize llamafile cpu matrix multiplication for ppc64le (llam…
amritahs-ibm Nov 9, 2024
d54b0d2
ggml: fix zero division in ‘dne’ calculation in CUDA COUNT_EQUAL oper…
SongXiaoXi Nov 9, 2024
065fc31
metal : hide debug messages from normal log
ggerganov Nov 9, 2024
be6999e
metal : fix F32 accumulation in FA vec kernel (llama/10232)
ggerganov Nov 9, 2024
e47d0eb
metal : fix build and some more comments (llama/10229)
ggerganov Nov 9, 2024
8536022
metal : reorder write loop in mul mat kernel + style (llama/10231)
ggerganov Nov 9, 2024
db5507a
vulkan: Fix newly added tests for permuted mul_mat and 1D im2col (lla…
jeffbolznv Nov 10, 2024
c4c4d88
metal : more precise Q*K in FA vec kernel (llama/10247)
ggerganov Nov 11, 2024
b606ad2
vulkan: Throttle the number of shader compiles during the build step.…
jeffbolznv Nov 11, 2024
c60ae68
vulkan: Optimize contiguous copies (llama/10254)
jeffbolznv Nov 13, 2024
3b637d4
sycl : Fixes to broken builds and test-backend-ops (llama/10257)
Alcpz Nov 13, 2024
3c337b2
sync : ggml
ggerganov Nov 15, 2024
3df5e16
whisper : fix build (#0)
ggerganov Nov 15, 2024
d93631c
talk-llama : sync llama.cpp
ggerganov Nov 15, 2024
19927ad
build : fixes
ggerganov Nov 15, 2024
463849a
whisper : include ggml-cpu.h (#0)
ggerganov Nov 15, 2024
f94863e
cmake : fix ppc64 check (#0)
ggerganov Nov 15, 2024
Makefile: 7 additions & 0 deletions
@@ -801,6 +801,7 @@ endif

OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-cpu.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o \
@@ -916,6 +917,12 @@ ggml/src/ggml.o: \
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-cpu.o: \
ggml/src/ggml-cpu.c \
ggml/include/ggml.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-alloc.o: \
ggml/src/ggml-alloc.c \
ggml/include/ggml.h \
Package.swift: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ let package = Package(
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-cpu.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-metal.m"
],
bindings/ruby/ext/extconf.rb: 1 addition & 0 deletions
@@ -162,6 +162,7 @@

$OBJ_GGML <<
'ggml.o' <<
'ggml-cpu.o' <<
'ggml-alloc.o' <<
'ggml-backend.o' <<
'ggml-quants.o' <<
bindings/ruby/extsources.yaml: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-cpu.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
@@ -18,6 +19,7 @@
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cpu.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
examples/talk-llama/llama-sampling.cpp: 6 additions & 98 deletions
@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
}

static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
// return;
// }
@@ -733,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
};
}

// tail-free

struct llama_sampler_tail_free {
const float z;
const size_t min_keep;
};

static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
return "tail-free";
}

static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;

if (ctx->z >= 1.0f || cur_p->size <= 2) {
return;
}

llama_sampler_softmax_impl(cur_p);

// Compute the first and second derivatives
std::vector<float> first_derivatives(cur_p->size - 1);
std::vector<float> second_derivatives(cur_p->size - 2);

for (size_t i = 0; i < first_derivatives.size(); ++i) {
first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
}
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
}

// Calculate absolute value of second derivatives
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = std::abs(second_derivatives[i]);
}

// Normalize the second derivatives
{
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);

if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
}

float cum_sum = 0.0f;
size_t last_idx = cur_p->size;
for (size_t i = 0; i < second_derivatives.size(); ++i) {
cum_sum += second_derivatives[i];

// Check if the running sum is greater than z or if we have kept at least min_keep tokens
if (cum_sum > ctx->z && i >= ctx->min_keep) {
last_idx = i;
break;
}
}

// Resize the output vector to keep only the tokens above the tail location
cur_p->size = last_idx;
}

static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
}

static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
delete (llama_sampler_tail_free *) smpl->ctx;
}

static struct llama_sampler_i llama_sampler_tail_free_i = {
/* .name = */ llama_sampler_tail_free_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_tail_free_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_tail_free_clone,
/* .free = */ llama_sampler_tail_free_free,
};

struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
return new llama_sampler {
/* .iface = */ &llama_sampler_tail_free_i,
/* .ctx = */ new llama_sampler_tail_free {
/* .z = */ z,
/*. min_keep = */ min_keep,
},
};
}

// typical

struct llama_sampler_typical {
@@ -1971,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
const auto * ctx = (llama_sampler_dry *) smpl->ctx;

// nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
llama_vocab dummy_vocab;

// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);

// Copy the state, including the processed breakers
{
auto * result_ctx = (llama_sampler_dry *) result->ctx;
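For reference, the tail-free sampler deleted in the llama-sampling.cpp diff above selected its cutoff from the normalized absolute second differences of the sorted token probabilities, truncating the distribution once their cumulative sum exceeded z. Below is a minimal standalone C++ sketch of that cutoff computation, mirroring the deleted logic; the function name tail_free_cutoff, the plain std::vector interface, and the example values in main are illustrative only and are not part of the llama.cpp API.

#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Returns the number of tokens to keep. Assumes `probs` is sorted in
// descending order and sums to 1 (the softmax step in the deleted code).
static size_t tail_free_cutoff(const std::vector<float> & probs, float z, size_t min_keep) {
    if (z >= 1.0f || probs.size() <= 2) {
        return probs.size();
    }

    // first and (absolute) second discrete derivatives of the sorted probabilities
    std::vector<float> d1(probs.size() - 1);
    std::vector<float> d2(probs.size() - 2);
    for (size_t i = 0; i < d1.size(); ++i) {
        d1[i] = probs[i] - probs[i + 1];
    }
    for (size_t i = 0; i < d2.size(); ++i) {
        d2[i] = std::fabs(d1[i] - d1[i + 1]);
    }

    // normalize the second derivatives so they sum to 1
    const float sum = std::accumulate(d2.begin(), d2.end(), 0.0f);
    for (float & v : d2) {
        v = (sum > 1e-6f) ? v / sum : 1.0f / d2.size();
    }

    // walk the normalized curve until the cumulative sum exceeds z,
    // keeping at least min_keep tokens
    float cum_sum = 0.0f;
    for (size_t i = 0; i < d2.size(); ++i) {
        cum_sum += d2[i];
        if (cum_sum > z && i >= min_keep) {
            return i;
        }
    }
    return probs.size();
}

int main() {
    const std::vector<float> probs = {0.50f, 0.25f, 0.12f, 0.08f, 0.03f, 0.02f};
    std::printf("keep %zu tokens\n", tail_free_cutoff(probs, 0.95f, 1));
    return 0;
}

With z close to 1 the filter keeps almost everything, while smaller z values cut the flat tail of the distribution more aggressively, which is what llama_sampler_tail_free_apply did before its removal.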