83 changes: 83 additions & 0 deletions llama.cpp/llava/llava-cli.cpp
@@ -8,7 +8,9 @@
#include "llama.cpp/llama.h"
#include "llama.cpp/base64.h"
#include "llamafile/version.h"
#include "llamafile/llamafile.h"

#include <cosmo.h>
#include <cstdio>
#include <cstdlib>
#include <vector>
@@ -63,6 +65,68 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";

// Auto-detect mmproj file from the main model if it's a ZIP/llamafile
static std::string auto_detect_mmproj(const std::string& model_path) {
    // Try common mmproj filenames
    std::vector<std::string> common_names = {
        "mmproj-model-f16.gguf",
        "mmproj-model-q4_0.gguf",
        "mmproj-model-q4_1.gguf",
        "mmproj.gguf",
        "vision_encoder.gguf",
        "clip.gguf",
        "visual.gguf"
    };

    LOG_TEE("Auto-detecting mmproj file (model_path='%s')...\n", model_path.c_str());

    // First, check if model_path is a ZIP file (contains @) or a regular file
    bool model_is_zip = false;
    if (!model_path.empty() && model_path != DEFAULT_MODEL_PATH) {
        // Check if it's a ZIP by looking for .gguf extension or seeing if we can open it
        struct llamafile* test = llamafile_open_gguf(model_path.c_str(), "rb");
        if (test) {
            // Check if it's actually a ZIP by looking at the opened path
            const char* opened_name = llamafile_name(test);
            model_is_zip = opened_name && strchr(opened_name, '@') != NULL;
            llamafile_close(test);
        }

        // If model is a ZIP/llamafile, try to find mmproj inside it
        if (model_is_zip || model_path.find(".llamafile") != std::string::npos) {
            std::string base_path = model_path;
            // Remove @ suffix if present
            size_t at_pos = base_path.find('@');
            if (at_pos != std::string::npos) {
                base_path = base_path.substr(0, at_pos);
            }

            for (const auto& name : common_names) {
                std::string test_path = base_path + "@" + name;
                struct llamafile* f = llamafile_open_gguf(test_path.c_str(), "rb");
                if (f) {
                    llamafile_close(f);
                    LOG_TEE("Auto-detected mmproj file: %s\n", test_path.c_str());
                    return test_path;
                }
            }
        }
    }

    // Try in the executable itself (when running as ./llava.llamafile)
    for (const auto& name : common_names) {
        struct llamafile* f = llamafile_open_gguf(name.c_str(), "rb");
        if (f) {
            llamafile_close(f);
            LOG_TEE("Auto-detected mmproj file: %s (embedded in executable)\n", name.c_str());
            return name;
        }
    }

    LOG_TEE("No mmproj file found via auto-detection\n");
    return "";
}

static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
@@ -132,6 +196,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {

LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n %s -m <llava.llamafile> --image <path/to/an/image.jpg> [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: --mmproj is optional when using a llamafile containing multiple GGUF files\n");
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

@@ -307,6 +373,23 @@ int llava_cli(int argc, char ** argv, gpt_params & params) {
    llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS

    // Handle running as llamafile executable
    if (params.model.empty()) {
        // Check if we're running as a llamafile with embedded model
        const char* prog = GetProgramExecutableName();
        struct llamafile* test = llamafile_open_gguf(prog, "rb");
        if (test) {
            llamafile_close(test);
            params.model = prog;
            LOG_TEE("Running as llamafile, using embedded model: %s\n", prog);
        }
    }

    // Auto-detect mmproj if not provided
    if (params.mmproj.empty()) {
        params.mmproj = auto_detect_mmproj(params.model);
    }

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv, {});
        return 1;
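The probing logic above leans on llamafile_open_gguf() accepting "archive@member" paths to address a single GGUF inside a ZIP. A minimal standalone sketch of that convention, with hypothetical file names (an illustration, not part of the patch):

#include "llamafile/llamafile.h"
#include <stdio.h>

int main(void) {
    // Hypothetical archive: a llamafile that embeds a vision encoder.
    // "archive@member" asks llamafile_open_gguf() to open one ZIP member,
    // the same form the probing loop in auto_detect_mmproj() builds.
    struct llamafile *f =
        llamafile_open_gguf("llava-v1.5-7b.llamafile@mmproj-model-f16.gguf", "rb");
    if (!f) {
        fprintf(stderr, "no embedded mmproj found\n");
        return 1;
    }
    printf("resolved: %s\n", llamafile_name(f)); // the stored "...@..." path
    llamafile_close(f);
    return 0;
}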
14 changes: 10 additions & 4 deletions llamafile/llamafile.c
@@ -177,10 +177,12 @@ static struct llamafile *llamafile_open_zip(const char *prog, const char *fname,
        goto Invalid;
    }
    if (found != 1) {
-        // TODO: Support opening LLaVA llamafiles.
-        fprintf(stderr, "%s: error: multiple %s files found in zip archive\n", prog,
-                fname ? fname : ".gguf");
-        goto Invalid;
+        // Multiple GGUF files found - this is OK for LLaVA models
+        // Just pick the first one found and log a message
+        fprintf(stderr, "%s: note: multiple GGUF files found in ZIP\n", prog);
+        if (!fname) {
+            fprintf(stderr, "%s: selecting '%s' (use @filename to specify)\n", prog, zip_name);
+        }
    }
    strlcat(file->fname, "@", PATH_MAX);
    strlcat(file->fname, zip_name, PATH_MAX);
@@ -398,6 +400,10 @@ static void llamafile_close_impl(struct llamafile *file) {
    free(file);
}

const char *llamafile_name(struct llamafile *file) {
    return file ? file->fname : NULL;
}

void llamafile_ref(struct llamafile *file) {
    atomic_fetch_add(&file->refs, 1);
}
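Note that llamafile_name() is deliberately null-safe (it returns NULL for a NULL file), so it can be called before checking whether the open succeeded. A small sketch of the usage pattern this enables; the helper name is ours, not part of the change:

#include "llamafile/llamafile.h"
#include <stdbool.h>
#include <string.h>

// Returns true when `path` resolved to a member inside a ZIP archive,
// using the '@' separator convention from this change.
static bool opened_from_zip(const char *path) {
    struct llamafile *f = llamafile_open_gguf(path, "rb");
    const char *name = llamafile_name(f);        // safe even if f is NULL
    bool from_zip = name && strchr(name, '@') != NULL;
    if (f)
        llamafile_close(f);
    return from_zip;
}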
1 change: 1 addition & 0 deletions llamafile/llamafile.h
@@ -79,6 +79,7 @@ bool llamafile_eof(struct llamafile *file);
FILE *llamafile_fp(struct llamafile *);
void llamafile_ref(struct llamafile *);
void llamafile_unref(struct llamafile *);
const char *llamafile_name(struct llamafile *);
char *llamafile_get_prompt(void);

void llamafile_govern(void);
83 changes: 83 additions & 0 deletions package_llava.sh
@@ -0,0 +1,83 @@
#!/bin/sh
# Script to package LLaVA models into a single llamafile
# This demonstrates the new multi-GGUF support

set -e

usage() {
    cat << EOF
Usage: $0 <language_model.gguf> <vision_encoder.gguf> <output.llamafile>

Package a LLaVA model (language model + vision encoder) into a single llamafile.

Arguments:
  language_model.gguf   Path to the language model GGUF file (e.g., llava-v1.5-7b-q4.gguf)
  vision_encoder.gguf   Path to the vision encoder GGUF file (e.g., mmproj-model-f16.gguf)
  output.llamafile      Output llamafile name

Example:
  $0 llava-v1.5-7b-q4.gguf mmproj-model-f16.gguf llava-v1.5-7b.llamafile

The resulting llamafile can be used without specifying --mmproj:
  ./llava-v1.5-7b.llamafile --image photo.jpg -p "What's in this image?"
EOF
    exit 1
}

if [ $# -ne 3 ]; then
    usage
fi

LANGUAGE_MODEL="$1"
VISION_ENCODER="$2"
OUTPUT_FILE="$3"

# Check if input files exist
if [ ! -f "$LANGUAGE_MODEL" ]; then
echo "Error: Language model file not found: $LANGUAGE_MODEL"
exit 1
fi

if [ ! -f "$VISION_ENCODER" ]; then
echo "Error: Vision encoder file not found: $VISION_ENCODER"
exit 1
fi

# Check if zipalign exists
if [ ! -x "o/$(uname -m)/bin/zipalign" ]; then
    echo "Error: zipalign not found. Please build it first."
    echo "Run: make -j8"
    exit 1
fi

# Check if llamafile binary exists
if [ ! -f "o/$(uname -m)/bin/llamafile" ]; then
echo "Error: llamafile binary not found. Please build it first."
echo "Run: make -j8"
exit 1
fi

echo "Packaging LLaVA model..."
echo " Language model: $LANGUAGE_MODEL"
echo " Vision encoder: $VISION_ENCODER"
echo " Output file: $OUTPUT_FILE"

# Copy the llamafile binary
cp "o/$(uname -m)/bin/llamafile" "$OUTPUT_FILE"

# Use zipalign to add both GGUF files
echo "Adding GGUF files to llamafile..."
./o/$(uname -m)/bin/zipalign -j0 "$OUTPUT_FILE" \
    "$LANGUAGE_MODEL" \
    "$VISION_ENCODER"

# Make it executable
chmod +x "$OUTPUT_FILE"

echo ""
echo "Successfully created $OUTPUT_FILE!"
echo ""
echo "You can now use it without specifying --mmproj:"
echo " ./$OUTPUT_FILE --image photo.jpg -p \"What's in this image?\""
echo ""
echo "The vision encoder will be auto-detected from the embedded files."