Skip to content

Commit edf46a3

Browse files
Merge branch 'ggerganov:master' into master
2 parents 1582e4e + 5ed26e1 commit edf46a3

File tree

5 files changed

+138
-12
lines changed

5 files changed

+138
-12
lines changed

.devops/nix/package.nix

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,22 @@
1313
cudaPackages,
1414
darwin,
1515
rocmPackages,
16+
vulkan-headers,
17+
vulkan-loader,
1618
clblast,
1719
useBlas ? builtins.all (x: !x) [
1820
useCuda
1921
useMetalKit
2022
useOpenCL
2123
useRocm
24+
useVulkan
2225
],
2326
useCuda ? config.cudaSupport,
2427
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
2528
useMpi ? false, # Increases the runtime closure size by ~700M
2629
useOpenCL ? false,
2730
useRocm ? config.rocmSupport,
31+
useVulkan ? false,
2832
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
2933
}@inputs:
3034

@@ -48,7 +52,8 @@ let
4852
++ lib.optionals useMetalKit [ "MetalKit" ]
4953
++ lib.optionals useMpi [ "MPI" ]
5054
++ lib.optionals useOpenCL [ "OpenCL" ]
51-
++ lib.optionals useRocm [ "ROCm" ];
55+
++ lib.optionals useRocm [ "ROCm" ]
56+
++ lib.optionals useVulkan [ "Vulkan" ];
5257

5358
pnameSuffix =
5459
strings.optionalString (suffices != [ ])
@@ -108,6 +113,11 @@ let
108113
hipblas
109114
rocblas
110115
];
116+
117+
vulkanBuildInputs = [
118+
vulkan-headers
119+
vulkan-loader
120+
];
111121
in
112122

113123
effectiveStdenv.mkDerivation (
@@ -164,7 +174,8 @@ effectiveStdenv.mkDerivation (
164174
++ optionals useCuda cudaBuildInputs
165175
++ optionals useMpi [ mpi ]
166176
++ optionals useOpenCL [ clblast ]
167-
++ optionals useRocm rocmBuildInputs;
177+
++ optionals useRocm rocmBuildInputs
178+
++ optionals useVulkan vulkanBuildInputs;
168179

169180
cmakeFlags =
170181
[
@@ -178,6 +189,7 @@ effectiveStdenv.mkDerivation (
178189
(cmakeBool "LLAMA_HIPBLAS" useRocm)
179190
(cmakeBool "LLAMA_METAL" useMetalKit)
180191
(cmakeBool "LLAMA_MPI" useMpi)
192+
(cmakeBool "LLAMA_VULKAN" useVulkan)
181193
]
182194
++ optionals useCuda [
183195
(
@@ -218,6 +230,7 @@ effectiveStdenv.mkDerivation (
218230
useMpi
219231
useOpenCL
220232
useRocm
233+
useVulkan
221234
;
222235

223236
shell = mkShell {
@@ -242,11 +255,11 @@ effectiveStdenv.mkDerivation (
242255
# Configurations we don't want even the CI to evaluate. Results in the
243256
# "unsupported platform" messages. This is mostly a no-op, because
244257
# cudaPackages would've refused to evaluate anyway.
245-
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
258+
badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
246259

247260
# Configurations that are known to result in build failures. Can be
248261
# overridden by importing Nixpkgs with `allowBroken = true`.
249-
broken = (useMetalKit && !effectiveStdenv.isDarwin);
262+
broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
250263

251264
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
252265
homepage = "https://github.com/ggerganov/llama.cpp/";

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ if (NOT MSVC)
7979
endif()
8080

8181
if (WIN32)
82-
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
82+
set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
8383
endif()
8484

8585
# 3rd party libs

Makefile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ MK_NVCCFLAGS += -O3
109109
else
110110
MK_CFLAGS += -O3
111111
MK_CXXFLAGS += -O3
112+
MK_NVCCFLAGS += -O3
112113
endif
113114

114115
# clock_gettime came in POSIX.1b (1993)
@@ -365,7 +366,7 @@ ifdef LLAMA_CUBLAS
365366
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
366367
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
367368
OBJS += ggml-cuda.o
368-
MK_NVCCFLAGS = -use_fast_math
369+
MK_NVCCFLAGS += -use_fast_math
369370
ifndef JETSON_EOL_MODULE_DETECT
370371
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
371372
endif # JETSON_EOL_MODULE_DETECT
@@ -552,8 +553,11 @@ $(info I CFLAGS: $(CFLAGS))
552553
$(info I CXXFLAGS: $(CXXFLAGS))
553554
$(info I NVCCFLAGS: $(NVCCFLAGS))
554555
$(info I LDFLAGS: $(LDFLAGS))
555-
$(info I CC: $(shell $(CC) --version | head -n 1))
556-
$(info I CXX: $(shell $(CXX) --version | head -n 1))
556+
$(info I CC: $(shell $(CC) --version | head -n 1))
557+
$(info I CXX: $(shell $(CXX) --version | head -n 1))
558+
ifdef LLAMA_CUBLAS
559+
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
560+
endif # LLAMA_CUBLAS
557561
$(info )
558562

559563
#

examples/imatrix/imatrix.cpp

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class IMatrixCollector {
3636
void set_parameters(StatParams&& params) { m_params = std::move(params); }
3737
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
3838
void save_imatrix() const;
39+
bool load_imatrix(const char * file_name, bool add);
40+
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
3941
private:
4042
std::unordered_map<std::string, Stats> m_stats;
4143
StatParams m_params;
@@ -189,6 +191,57 @@ void IMatrixCollector::save_imatrix(const char * fname) const {
189191
}
190192
}
191193

194+
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
195+
std::ifstream in(imatrix_file, std::ios::binary);
196+
if (!in) {
197+
printf("%s: failed to open %s\n",__func__,imatrix_file);
198+
return false;
199+
}
200+
int n_entries;
201+
in.read((char*)&n_entries, sizeof(n_entries));
202+
if (in.fail() || n_entries < 1) {
203+
printf("%s: no data in file %s\n", __func__, imatrix_file);
204+
return false;
205+
}
206+
for (int i = 0; i < n_entries; ++i) {
207+
int len; in.read((char *)&len, sizeof(len));
208+
std::vector<char> name_as_vec(len+1);
209+
in.read((char *)name_as_vec.data(), len);
210+
if (in.fail()) {
211+
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
212+
return false;
213+
}
214+
name_as_vec[len] = 0;
215+
std::string name{name_as_vec.data()};
216+
auto& e = imatrix_data[std::move(name)];
217+
int ncall;
218+
in.read((char*)&ncall, sizeof(ncall));
219+
int nval;
220+
in.read((char *)&nval, sizeof(nval));
221+
if (in.fail() || nval < 1) {
222+
printf("%s: failed reading number of values for entry %d\n",__func__,i);
223+
imatrix_data = {};
224+
return false;
225+
}
226+
e.values.resize(nval);
227+
in.read((char*)e.values.data(), nval*sizeof(float));
228+
if (in.fail()) {
229+
printf("%s: failed reading data for entry %d\n",__func__,i);
230+
imatrix_data = {};
231+
return false;
232+
}
233+
e.ncall = ncall;
234+
}
235+
return true;
236+
}
237+
238+
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
239+
if (!add) {
240+
m_stats.clear();
241+
}
242+
return load_imatrix(file_name, m_stats);
243+
}
244+
192245
static IMatrixCollector g_collector;
193246

194247
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -269,7 +322,7 @@ static void process_logits(
269322
}
270323
}
271324

272-
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
325+
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
273326

274327
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
275328
const int n_ctx = llama_n_ctx(ctx);
@@ -282,6 +335,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
282335
auto tim2 = std::chrono::high_resolution_clock::now();
283336
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
284337

338+
if (from_chunk > 0) {
339+
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
340+
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
341+
return false;
342+
}
343+
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
344+
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
345+
}
346+
285347
if (int(tokens.size()) < 2*n_ctx) {
286348
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
287349
n_ctx);
@@ -402,7 +464,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
402464
int main(int argc, char ** argv) {
403465

404466
StatParams sparams;
467+
std::string prev_result_file;
468+
std::string combine_files;
405469
bool compute_ppl = true;
470+
int from_chunk = 0;
406471
std::vector<char*> args;
407472
args.push_back(argv[0]);
408473
int iarg = 1;
@@ -423,6 +488,13 @@ int main(int argc, char ** argv) {
423488
compute_ppl = false;
424489
} else if (arg == "--keep-imatrix") {
425490
sparams.keep_every = std::stoi(argv[++iarg]);
491+
} else if (arg == "--continue-from") {
492+
prev_result_file = argv[++iarg];
493+
} else if (arg == "--combine") {
494+
combine_files = argv[++iarg];
495+
}
496+
else if (arg == "--from-chunk") {
497+
from_chunk = std::stoi(argv[++iarg]);
426498
} else {
427499
args.push_back(argv[iarg]);
428500
}
@@ -436,14 +508,50 @@ int main(int argc, char ** argv) {
436508
}
437509
}
438510

511+
g_collector.set_parameters(std::move(sparams));
512+
513+
if (!combine_files.empty()) {
514+
std::vector<std::string> files;
515+
size_t pos = 0;
516+
while (true) {
517+
auto new_pos = combine_files.find(',', pos);
518+
if (new_pos != std::string::npos) {
519+
files.emplace_back(combine_files.substr(pos, new_pos - pos));
520+
pos = new_pos + 1;
521+
} else {
522+
files.emplace_back(combine_files.substr(pos));
523+
break;
524+
}
525+
}
526+
if (files.size() < 2) {
527+
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
528+
return 1;
529+
}
530+
printf("Combining the following %d files\n", int(files.size()));
531+
for (auto& file : files) {
532+
printf(" %s\n", file.c_str());
533+
if (!g_collector.load_imatrix(file.c_str(), true)) {
534+
fprintf(stderr, "Failed to load %s\n", file.c_str());
535+
return 1;
536+
}
537+
}
538+
g_collector.save_imatrix();
539+
return 0;
540+
}
541+
542+
if (!prev_result_file.empty()) {
543+
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
544+
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
545+
return 1;
546+
}
547+
}
548+
439549
gpt_params params;
440550
params.n_batch = 512;
441551
if (!gpt_params_parse(args.size(), args.data(), params)) {
442552
return 1;
443553
}
444554

445-
g_collector.set_parameters(std::move(sparams));
446-
447555
params.logits_all = true;
448556
params.n_batch = std::min(params.n_batch, params.n_ctx);
449557

@@ -495,7 +603,7 @@ int main(int argc, char ** argv) {
495603
fprintf(stderr, "%s\n", get_system_info(params).c_str());
496604
}
497605

498-
bool OK = compute_imatrix(ctx, params, compute_ppl);
606+
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
499607
if (!OK) {
500608
return 1;
501609
}

flake.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@
157157

158158
mpi-cpu = config.packages.default.override { useMpi = true; };
159159
mpi-cuda = config.packages.default.override { useMpi = true; };
160+
vulkan = config.packages.default.override { useVulkan = true; };
160161
}
161162
// lib.optionalAttrs (system == "x86_64-linux") {
162163
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

0 commit comments

Comments
 (0)