LLAMAFILE-QUANTIZE(1)       General Commands Manual      LLAMAFILE-QUANTIZE(1)

NAME
     llamafile-quantize — large language model quantizer

SYNOPSIS
     llamafile-quantize [flags...] model-f32.gguf [model-quant.gguf] type
     [nthreads]

DESCRIPTION
     llamafile-quantize converts large language model weights from the
     float32 or float16 formats into smaller data types from 2 to 8 bits in
     size.
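
     For example, a typical invocation that converts float16 weights into
     the Q4_K_M format might look like this (the filenames here are
     illustrative, not prescribed by the tool):

           # hypothetical filenames; any float32/float16 GGUF input works
           llamafile-quantize llama-7b.f16.gguf llama-7b.Q4_K_M.gguf Q4_K_M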

OPTIONS
     The following flags are available:

     --allow-requantize
             Allows requantizing tensors that have already been quantized.
             Warning: this can severely reduce quality compared to
             quantizing from 16-bit or 32-bit weights.

     --leave-output-tensor
             Leaves output.weight un(re)quantized. This increases model
             size, but may also increase quality, especially when
             requantizing (see the example following this list).

     --pure  Disables k-quant mixtures and quantizes all tensors to the
             same type.
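
     For example, a requantization run that keeps output.weight at full
     precision could be invoked as follows (filenames are illustrative):

           # hypothetical filenames; requantizing already-quantized
           # weights may reduce quality compared to a 16/32-bit source
           llamafile-quantize --allow-requantize --leave-output-tensor \
                   model.Q8_0.gguf model.Q5_K_M.gguf Q5_K_M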

ARGUMENTS
     The following positional arguments are accepted:

     model-f32.gguf
             Is the input file, which contains the unquantized model
             weights in either the float32 or float16 format.

     model-quant.gguf
             Is the output file, which will contain quantized weights in
             the desired format. If this path isn't specified, it defaults
             to [inp path]/ggml-model-[ftype].gguf.

     type    Is the desired quantization format, which may be the integer
             id of a supported quantization type, or its name. See the
             QUANTIZATION TYPES section below for acceptable formats.

     nthreads
             Number of threads to use during computation (default:
             nproc/2). An example appears after this list.
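
     For example, passing the integer id 18 selects the same format as
     passing the name Q6_K, and a trailing thread count may be supplied
     (filenames are illustrative):

           # 18 is the id of Q6_K; run the quantization on 8 threads
           llamafile-quantize llama-7b.f16.gguf llama-7b.Q6_K.gguf 18 8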

QUANTIZATION TYPES
     The following quantization types are available. This table shows the
     ID of the quantization format, its name, the file size of 7B model
     weights that use it, and finally the amount of quality badness it
     introduces as measured by the llamafile-perplexity tool averaged over
     128 chunks with the TinyLLaMA 1.1B v1.0 Chat model. Rows are ordered
     in accordance with how recommended the quantization format is for
     general usage.

     -  18  Q6_K    5.6gb  +0.0446 ppl (q6 kawrakow)
     -   7  Q8_0    7.2gb  +0.0022 ppl (q8 gerganov)
     -   1  F16      14gb  +0.0000 ppl (best but biggest)
     -   8  Q5_0    4.7gb  +0.0817 ppl (q5 gerganov zero)
     -  17  Q5_K_M  4.8gb  +0.0836 ppl (q5 kawrakow medium)
     -  16  Q5_K_S  4.7gb  +0.1049 ppl (q5 kawrakow small)
     -  15  Q4_K_M  4.1gb  +0.3132 ppl (q4 kawrakow medium)
     -  14  Q4_K_S  3.9gb  +0.3408 ppl (q4 kawrakow small)
     -  13  Q3_K_L  3.6gb  +0.5736 ppl (q3 kawrakow large)
     -  12  Q3_K_M  3.3gb  +0.7612 ppl (q3 kawrakow medium)
     -  11  Q3_K_S  3.0gb  +1.3834 ppl (q3 kawrakow small)
     -  10  Q2_K    2.6gb  +4.2359 ppl (tiniest hallucinates most)
     -  32  BF16     14gb  +0.0000 ppl (canonical but cpu/cuda only)
     -   0  F32      27gb   9.0952 ppl (reference point)
     -   2  Q4_0    3.9gb  +0.3339 ppl (legacy)
     -   3  Q4_1    4.3gb  +0.4163 ppl (legacy)
     -   9  Q5_1    5.1gb  +0.1091 ppl (legacy)
     -  12  Q3_K    alias for Q3_K_M
     -  15  Q4_K    alias for Q4_K_M
     -  17  Q5_K    alias for Q5_K_M
     -  COPY        Only copy tensors, no quantizing.
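
     For example, the COPY type listed above rewrites a model without
     quantizing it, producing a fresh GGUF file with identical weights
     (filenames are illustrative):

           # hypothetical filenames; COPY copies tensors unmodified
           llamafile-quantize llama-7b.f16.gguf llama-7b.copy.gguf COPY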

SEE ALSO
     llamafile(1), llamafile-imatrix(1), llamafile-perplexity(1),
     llava-quantize(1), zipalign(1), unzip(1)

Llamafile Manual             December 5, 2023            LLAMAFILE-QUANTIZE(1)