Skip to content

Commit 0d08588

Browse files
committed
initial pass
1 parent 36696f3 commit 0d08588

File tree

12 files changed

+317502
-0
lines changed

12 files changed

+317502
-0
lines changed

TODO

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
```
./make -j8
./make o//llama.cpp/embedr/embedr
./o/llama.cpp/embedr/embedr --version
./o/llama.cpp/embedr/embedr
```

llama.cpp/BUILD.mk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ include llama.cpp/server/BUILD.mk
2626
include llama.cpp/main/BUILD.mk
2727
include llama.cpp/imatrix/BUILD.mk
2828
include llama.cpp/quantize/BUILD.mk
29+
include llama.cpp/embedr/BUILD.mk
2930
include llama.cpp/perplexity/BUILD.mk
3031
include llama.cpp/llama-bench/BUILD.mk
3132

@@ -89,6 +90,7 @@ $(LLAMA_CPP_OBJS): llama.cpp/BUILD.mk
8990
o/$(MODE)/llama.cpp: \
9091
o/$(MODE)/llama.cpp/main \
9192
o/$(MODE)/llama.cpp/llava \
93+
o/$(MODE)/llama.cpp/embedr \
9294
o/$(MODE)/llama.cpp/server \
9395
o/$(MODE)/llama.cpp/imatrix \
9496
o/$(MODE)/llama.cpp/quantize \

llama.cpp/embedr/BUILD.mk

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

# Build rules for the embedr tool: llama.cpp linked together with an
# embedded SQLite plus the sqlite-vec extension and the SQLite shell.

PKGS += LLAMA_CPP_EMBEDR

LLAMA_CPP_EMBEDR_FILES := $(wildcard llama.cpp/embedr/*)
LLAMA_CPP_EMBEDR_HDRS = $(filter %.h,$(LLAMA_CPP_EMBEDR_FILES))
LLAMA_CPP_EMBEDR_SRCS_C = $(filter %.c,$(LLAMA_CPP_EMBEDR_FILES))
LLAMA_CPP_EMBEDR_SRCS_CPP = $(filter %.cpp,$(LLAMA_CPP_EMBEDR_FILES))
LLAMA_CPP_EMBEDR_SRCS = $(LLAMA_CPP_EMBEDR_SRCS_C) $(LLAMA_CPP_EMBEDR_SRCS_CPP)

LLAMA_CPP_EMBEDR_OBJS = \
	$(LLAMA_CPP_EMBEDR_SRCS_C:%.c=o/$(MODE)/%.o) \
	$(LLAMA_CPP_EMBEDR_SRCS_CPP:%.cpp=o/$(MODE)/%.o)

# An archive must be built from object files, not raw .c sources;
# the generic %.a rule archives whatever prerequisites are listed.
o/$(MODE)/llama.cpp/embedr/embedr.a: $(LLAMA_CPP_EMBEDR_OBJS)

o/$(MODE)/llama.cpp/embedr/sqlite3.o: llama.cpp/embedr/sqlite3.c
o/$(MODE)/llama.cpp/embedr/sqlite3.a: o/$(MODE)/llama.cpp/embedr/sqlite3.o

o/$(MODE)/llama.cpp/embedr/sqlite-vec.o: llama.cpp/embedr/sqlite-vec.c
o/$(MODE)/llama.cpp/embedr/sqlite-vec.a: o/$(MODE)/llama.cpp/embedr/sqlite-vec.o

o/$(MODE)/llama.cpp/embedr/shell.o: llama.cpp/embedr/shell.c
o/$(MODE)/llama.cpp/embedr/shell.a: o/$(MODE)/llama.cpp/embedr/shell.o

# sqlite3/sqlite-vec/shell are compiled as statically-linked extensions,
# so SQLITE_CORE must be defined for every object in this package.
$(LLAMA_CPP_EMBEDR_OBJS): private CCFLAGS += -DSQLITE_CORE

# Link the embedr executable. embedr.1.asc.zip.o embeds the man page.
o/$(MODE)/llama.cpp/embedr/embedr: \
		o/$(MODE)/llama.cpp/embedr/shell.a \
		o/$(MODE)/llama.cpp/embedr/embedr.o \
		o/$(MODE)/llama.cpp/embedr/embedr.1.asc.zip.o \
		o/$(MODE)/llama.cpp/llama.cpp.a \
		o/$(MODE)/llama.cpp/embedr/sqlite3.a \
		o/$(MODE)/llama.cpp/embedr/sqlite-vec.a

# Convenience pseudo-target so `make o/$(MODE)/llama.cpp/embedr` works.
.PHONY: o/$(MODE)/llama.cpp/embedr
o/$(MODE)/llama.cpp/embedr: \
		o/$(MODE)/llama.cpp/embedr/embedr

# Rebuild everything in this package when the build rules change.
$(LLAMA_CPP_EMBEDR_OBJS): llama.cpp/BUILD.mk llama.cpp/embedr/BUILD.mk

llama.cpp/embedr/embedr.1

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
.Dd December 5, 2023
2+
.Dt LLAMAFILE-QUANTIZE 1
3+
.Os Llamafile Manual
4+
.Sh NAME
5+
.Nm llamafile-quantize
6+
.Nd large language model quantizer
7+
.Sh SYNOPSIS
8+
.Nm
9+
.Op flags...
10+
.Ar model-f32.gguf
11+
.Op Ar model-quant.gguf
12+
.Ar type
13+
.Op Ar nthreads
14+
.Sh DESCRIPTION
15+
.Nm
16+
converts large language model weights from the float32 or float16
17+
formats into smaller data types from 2 to 8 bits in size.
18+
.Sh OPTIONS
19+
The following flags are available:
20+
.Bl -tag -width indent
21+
.It Fl Fl allow-requantize
22+
Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit
23+
.It Fl Fl leave-output-tensor
24+
Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
25+
.It Fl Fl pure
26+
Disable k-quant mixtures and quantize all tensors to the same type
27+
.El
28+
.Sh ARGUMENTS
29+
The following positional arguments are accepted:
30+
.Bl -tag -width indent
31+
.It Ev Ar model-f32.gguf
32+
Is the input file, which contains the unquantized model weights in either the float32 or float16 format.
33+
.It Ev Ar model-quant.gguf
34+
Is the output file, which will contain quantized weights in the desired format. If this path isn't specified, it'll default to [inp path]/ggml-model-[ftype].gguf.
35+
.It Ev Ar type
36+
Is the desired quantization format, which may be the integer id of a supported quantization type, or its name. See the quantization types section below for acceptable formats.
37+
.It Ev Ar nthreads
38+
Number of threads to use during computation (default: nproc/2)
39+
.El
40+
.Sh QUANTIZATION TYPES
41+
The following quantization types are available. This table shows the ID
42+
of the quantization format, its name, the file size of 7B model weights
43+
that use it, and finally the amount of quality badness it introduces as
44+
measured by the llamafile-perplexity tool averaged over 128 chunks with
45+
the TinyLLaMA 1.1B v1.0 Chat model. Rows are ordered in accordance with
46+
how recommended the quantization format is for general usage.
47+
.Pp
48+
.Bl -dash -compact
49+
.It
50+
  18 Q6_K 5.6gb +0.0446 ppl (q6 kawrakow)
51+
.It
52+
   7 Q8_0 7.2gb +0.0022 ppl (q8 gerganov)
53+
.It
54+
   1 F16 14gb +0.0000 ppl (best but biggest)
55+
.It
56+
   8 Q5_0 4.7gb +0.0817 ppl (q5 gerganov zero)
57+
.It
58+
  17 Q5_K_M 4.8gb +0.0836 ppl (q5 kawrakow medium)
59+
.It
60+
  16 Q5_K_S 4.7gb +0.1049 ppl (q5 kawrakow small)
61+
.It
62+
  15 Q4_K_M 4.1gb +0.3132 ppl (q4 kawrakow medium)
63+
.It
64+
  14 Q4_K_S 3.9gb +0.3408 ppl (q4 kawrakow small)
65+
.It
66+
  13 Q3_K_L 3.6gb +0.5736 ppl (q3 kawrakow large)
67+
.It
68+
  12 Q3_K_M 3.3gb +0.7612 ppl (q3 kawrakow medium)
69+
.It
70+
  11 Q3_K_S 3.0gb +1.3834 ppl (q3 kawrakow small)
71+
.It
72+
  10 Q2_K 2.6gb +4.2359 ppl (tiniest hallucinates most)
73+
.It
74+
  32 BF16 14gb +0.0000 ppl (canonical but cpu/cuda only)
75+
.It
76+
   0 F32 27gb 9.0952 ppl (reference point)
77+
.It
78+
   2 Q4_0 3.9gb +0.3339 ppl (legacy)
79+
.It
80+
   3 Q4_1 4.3gb +0.4163 ppl (legacy)
81+
.It
82+
   9 Q5_1 5.1gb +0.1091 ppl (legacy)
83+
.It
84+
  12 Q3_K alias for Q3_K_M
85+
.It
86+
  15 Q4_K alias for Q4_K_M
87+
.It
88+
  17 Q5_K alias for Q5_K_M
89+
.It
90+
COPY Only copy tensors, no quantizing.
91+
.El
92+
.Sh SEE ALSO
93+
.Xr llamafile 1 ,
94+
.Xr llamafile-imatrix 1 ,
95+
.Xr llamafile-perplexity 1 ,
96+
.Xr llava-quantize 1 ,
97+
.Xr zipalign 1 ,
98+
.Xr unzip 1

llama.cpp/embedr/embedr.1.asc

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
LLAMAFILE-QUANTIZE(1) General Commands Manual LLAMAFILE-QUANTIZE(1)
2+
3+
NAME
4+
llamafile-quantize — large language model quantizer
5+
6+
SYNOPSIS
7+
llamafile-quantize [flags...] model-f32.gguf [model-quant.gguf] type
8+
[nthreads]
9+
10+
DESCRIPTION
11+
llamafile-quantize converts large language model weights from the
12+
float32 or float16 formats into smaller data types from 2 to 8 bits in
13+
size.
14+
15+
OPTIONS
16+
The following flags are available:
17+
18+
--allow-requantize
19+
Allows requantizing tensors that have already been quantized.
20+
Warning: This can severely reduce quality compared to quantiz‐
21+
ing from 16bit or 32bit
22+
23+
--leave-output-tensor
24+
Will leave output.weight un(re)quantized. Increases model size
25+
but may also increase quality, especially when requantizing
26+
27+
--pure Disable k-quant mixtures and quantize all tensors to the same
28+
type
29+
30+
ARGUMENTS
31+
The following positional arguments are accepted:
32+
33+
model-f32.gguf
34+
Is the input file, which contains the unquantized model weights
35+
in either the float32 or float16 format.
36+
37+
model-quant.gguf
38+
Is the output file, which will contain quantized weights in the
39+
desired format. If this path isn't specified, it'll default to
40+
[inp path]/ggml-model-[ftype].gguf.
41+
42+
type Is the desired quantization format, which may be the integer id
43+
of a supported quantization type, or its name. See the quanti‐
44+
zation types section below for acceptable formats.
45+
46+
nthreads
47+
Number of threads to use during computation (default: nproc/2)
48+
49+
QUANTIZATION TYPES
50+
The following quantization types are available. This table shows the ID
51+
of the quantization format, its name, the file size of 7B model weights
52+
that use it, and finally the amount of quality badness it introduces as
53+
measured by the llamafile-perplexity tool averaged over 128 chunks with
54+
the TinyLLaMA 1.1B v1.0 Chat model. Rows are ordered in accordance with
55+
how recommended the quantization format is for general usage.
56+
57+
- 18 Q6_K 5.6gb +0.0446 ppl (q6 kawrakow)
58+
- 7 Q8_0 7.2gb +0.0022 ppl (q8 gerganov)
59+
- 1 F16 14gb +0.0000 ppl (best but biggest)
60+
- 8 Q5_0 4.7gb +0.0817 ppl (q5 gerganov zero)
61+
- 17 Q5_K_M 4.8gb +0.0836 ppl (q5 kawrakow medium)
62+
- 16 Q5_K_S 4.7gb +0.1049 ppl (q5 kawrakow small)
63+
- 15 Q4_K_M 4.1gb +0.3132 ppl (q4 kawrakow medium)
64+
- 14 Q4_K_S 3.9gb +0.3408 ppl (q4 kawrakow small)
65+
- 13 Q3_K_L 3.6gb +0.5736 ppl (q3 kawrakow large)
66+
- 12 Q3_K_M 3.3gb +0.7612 ppl (q3 kawrakow medium)
67+
- 11 Q3_K_S 3.0gb +1.3834 ppl (q3 kawrakow small)
68+
- 10 Q2_K 2.6gb +4.2359 ppl (tiniest hallucinates most)
69+
- 32 BF16 14gb +0.0000 ppl (canonical but cpu/cuda only)
70+
- 0 F32 27gb 9.0952 ppl (reference point)
71+
- 2 Q4_0 3.9gb +0.3339 ppl (legacy)
72+
- 3 Q4_1 4.3gb +0.4163 ppl (legacy)
73+
- 9 Q5_1 5.1gb +0.1091 ppl (legacy)
74+
- 12 Q3_K alias for Q3_K_M
75+
- 15 Q4_K alias for Q4_K_M
76+
- 17 Q5_K alias for Q5_K_M
77+
- COPY Only copy tensors, no quantizing.
78+
79+
SEE ALSO
80+
llamafile(1), llamafile-imatrix(1), llamafile-perplexity(1),
81+
llava-quantize(1), zipalign(1), unzip(1)
82+
83+
Llamafile Manual December 5, 2023 LLAMAFILE-QUANTIZE(1)

llama.cpp/embedr/embedr.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// embedr entry point: smoke-tests the statically linked SQLite +
// sqlite-vec stack, or hands control to the bundled SQLite shell
// when invoked as `embedr sh ...`.
#include "llama.cpp/llama.h"
#include "llamafile/version.h"
#include "llama.cpp/embedr/sqlite3.h"
#include "llama.cpp/embedr/sqlite-vec.h"
#include "llama.cpp/embedr/shell.h"
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv) {
    int rc;
    sqlite3 *db = NULL;
    sqlite3_stmt *stmt = NULL;

    // Register sqlite-vec so every connection opened from here on
    // (including by the shell) has the vec_*() SQL functions.
    // The original code computed rc here but never checked it.
    rc = sqlite3_auto_extension((void (*)(void))sqlite3_vec_init);
    if (rc != SQLITE_OK) {
        fprintf(stderr, "error: failed to register sqlite-vec extension\n");
        return 1;
    }

    // `embedr sh ...` enters the SQLite shell. mn() is presumably the
    // renamed shell main from shell.h — TODO confirm against shell.c.
    if (argc > 1 && strcmp(argv[1], "sh") == 0) {
        return mn(argc, argv);
    }

    printf("%d\n", argc);  // NOTE(review): looks like leftover debug output
    printf("llamafile-embed %s, SQLite %s, sqlite-vec=%s, %d\n",
           LLAMAFILE_VERSION_STRING, sqlite3_version, SQLITE_VEC_VERSION,
           LLAMA_FTYPE_MOSTLY_Q4_1);

    rc = sqlite3_open(":memory:", &db);
    if (rc != SQLITE_OK) {
        printf("x\n");
        sqlite3_close(db);  // per SQLite docs, close even when open fails
        return 1;
    }

    // Prove the auto extension actually loaded by calling vec_version().
    rc = sqlite3_prepare_v2(db, "select vec_version()", -1, &stmt, NULL);
    if (rc != SQLITE_OK) {
        printf("a\n");
        sqlite3_close(db);  // was leaked on this path before
        return 1;
    }
    rc = sqlite3_step(stmt);
    if (rc != SQLITE_ROW) {
        printf("b\n");
        sqlite3_finalize(stmt);
        sqlite3_close(db);  // was leaked on this path before
        return 1;
    }
    // sqlite3_column_text() returns const unsigned char*; cast for %s.
    printf("x=%s\n", (const char *)sqlite3_column_text(stmt, 0));

    sqlite3_finalize(stmt);
    sqlite3_close(db);  // connection was never closed in the original
    return 0;
}

0 commit comments

Comments
 (0)