83 changes: 83 additions & 0 deletions llama.cpp/llava/llava-cli.cpp
@@ -8,7 +8,9 @@
#include "llama.cpp/llama.h"
#include "llama.cpp/base64.h"
#include "llamafile/version.h"
#include "llamafile/llamafile.h"

#include <cosmo.h>
#include <cstdio>
#include <cstdlib>
#include <vector>
@@ -63,6 +65,68 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";

// Auto-detect mmproj file from the main model if it's a ZIP/llamafile
static std::string auto_detect_mmproj(const std::string& model_path) {
    // Try common mmproj filenames
    std::vector<std::string> common_names = {
        "mmproj-model-f16.gguf",
        "mmproj-model-q4_0.gguf",
        "mmproj-model-q4_1.gguf",
        "mmproj.gguf",
        "vision_encoder.gguf",
        "clip.gguf",
        "visual.gguf"
    };

    LOG_TEE("Auto-detecting mmproj file (model_path='%s')...\n", model_path.c_str());

    // First, check if model_path is a ZIP file (contains @) or a regular file
    bool model_is_zip = false;
    if (!model_path.empty() && model_path != DEFAULT_MODEL_PATH) {
        // Check if it's a ZIP by looking for .gguf extension or seeing if we can open it
        struct llamafile* test = llamafile_open_gguf(model_path.c_str(), "rb");
        if (test) {
            // Check if it's actually a ZIP by looking at the opened path
            const char* opened_name = llamafile_name(test);
            model_is_zip = opened_name && strchr(opened_name, '@') != NULL;
            llamafile_close(test);
        }

        // If model is a ZIP/llamafile, try to find mmproj inside it
        if (model_is_zip || model_path.find(".llamafile") != std::string::npos) {
            std::string base_path = model_path;
            // Remove @ suffix if present
            size_t at_pos = base_path.find('@');
            if (at_pos != std::string::npos) {
                base_path = base_path.substr(0, at_pos);
            }

            for (const auto& name : common_names) {
                std::string test_path = base_path + "@" + name;
                struct llamafile* f = llamafile_open_gguf(test_path.c_str(), "rb");
                if (f) {
                    llamafile_close(f);
                    LOG_TEE("Auto-detected mmproj file: %s\n", test_path.c_str());
                    return test_path;
                }
            }
        }
    }

    // Try in the executable itself (when running as ./llava.llamafile)
    for (const auto& name : common_names) {
        struct llamafile* f = llamafile_open_gguf(name.c_str(), "rb");
        if (f) {
            llamafile_close(f);
            LOG_TEE("Auto-detected mmproj file: %s (embedded in executable)\n", name.c_str());
            return name;
        }
    }

    LOG_TEE("No mmproj file found via auto-detection\n");
    return "";
}

static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
@@ -132,6 +196,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {

LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n %s -m <llava.llamafile> --image <path/to/an/image.jpg> [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: --mmproj is optional when using a llamafile containing multiple GGUF files\n");
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

@@ -307,6 +373,23 @@ int llava_cli(int argc, char ** argv, gpt_params & params) {
    llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS

    // Handle running as llamafile executable
    if (params.model.empty()) {
        // Check if we're running as a llamafile with embedded model
        const char* prog = GetProgramExecutableName();
        struct llamafile* test = llamafile_open_gguf(prog, "rb");
        if (test) {
            llamafile_close(test);
            params.model = prog;
            LOG_TEE("Running as llamafile, using embedded model: %s\n", prog);
        }
    }

    // Auto-detect mmproj if not provided
    if (params.mmproj.empty()) {
        params.mmproj = auto_detect_mmproj(params.model);
    }

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv, {});
        return 1;
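The probing logic above leans on llamafile_open_gguf() accepting "archive@member" paths to address a single GGUF inside a ZIP. A minimal standalone sketch of that convention, with hypothetical file names (an illustration, not part of the patch):

#include "llamafile/llamafile.h"
#include <stdio.h>

int main(void) {
    // Hypothetical archive: a llamafile that embeds a vision encoder.
    // "archive@member" asks llamafile_open_gguf() to open one ZIP member,
    // the same form the probing loop in auto_detect_mmproj() builds.
    struct llamafile *f =
        llamafile_open_gguf("llava-v1.5-7b.llamafile@mmproj-model-f16.gguf", "rb");
    if (!f) {
        fprintf(stderr, "no embedded mmproj found\n");
        return 1;
    }
    printf("resolved: %s\n", llamafile_name(f)); // the stored "...@..." path
    llamafile_close(f);
    return 0;
}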
14 changes: 10 additions & 4 deletions llamafile/llamafile.c
@@ -177,10 +177,12 @@ static struct llamafile *llamafile_open_zip(const char *prog, const char *fname,
        goto Invalid;
    }
    if (found != 1) {
-        // TODO: Support opening LLaVA llamafiles.
-        fprintf(stderr, "%s: error: multiple %s files found in zip archive\n", prog,
-                fname ? fname : ".gguf");
-        goto Invalid;
+        // Multiple GGUF files found - this is OK for LLaVA models
+        // Just pick the first one found and log a message
+        fprintf(stderr, "%s: note: multiple GGUF files found in ZIP\n", prog);
+        if (!fname) {
+            fprintf(stderr, "%s: selecting '%s' (use @filename to specify)\n", prog, zip_name);
+        }
    }
    strlcat(file->fname, "@", PATH_MAX);
    strlcat(file->fname, zip_name, PATH_MAX);
@@ -398,6 +400,10 @@ static void llamafile_close_impl(struct llamafile *file) {
    free(file);
}

const char *llamafile_name(struct llamafile *file) {
    return file ? file->fname : NULL;
}

void llamafile_ref(struct llamafile *file) {
    atomic_fetch_add(&file->refs, 1);
}
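Note that llamafile_name() is deliberately null-safe (it returns NULL for a NULL file), so it can be called before checking whether the open succeeded. A small sketch of the usage pattern this enables; the helper name is ours, not part of the change:

#include "llamafile/llamafile.h"
#include <stdbool.h>
#include <string.h>

// Returns true when `path` resolved to a member inside a ZIP archive,
// using the '@' separator convention from this change.
static bool opened_from_zip(const char *path) {
    struct llamafile *f = llamafile_open_gguf(path, "rb");
    const char *name = llamafile_name(f);        // safe even if f is NULL
    bool from_zip = name && strchr(name, '@') != NULL;
    if (f)
        llamafile_close(f);
    return from_zip;
}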
1 change: 1 addition & 0 deletions llamafile/llamafile.h
@@ -79,6 +79,7 @@ bool llamafile_eof(struct llamafile *file);
FILE *llamafile_fp(struct llamafile *);
void llamafile_ref(struct llamafile *);
void llamafile_unref(struct llamafile *);
const char *llamafile_name(struct llamafile *);
char *llamafile_get_prompt(void);

void llamafile_govern(void);
83 changes: 83 additions & 0 deletions package_llava.sh
@@ -0,0 +1,83 @@
#!/bin/sh
# Script to package LLaVA models into a single llamafile
# This demonstrates the new multi-GGUF support

set -e

usage() {
    cat << EOF
Usage: $0 <language_model.gguf> <vision_encoder.gguf> <output.llamafile>

Package a LLaVA model (language model + vision encoder) into a single llamafile.

Arguments:
  language_model.gguf   Path to the language model GGUF file (e.g., llava-v1.5-7b-q4.gguf)
  vision_encoder.gguf   Path to the vision encoder GGUF file (e.g., mmproj-model-f16.gguf)
  output.llamafile      Output llamafile name

Example:
  $0 llava-v1.5-7b-q4.gguf mmproj-model-f16.gguf llava-v1.5-7b.llamafile

The resulting llamafile can be used without specifying --mmproj:
  ./llava-v1.5-7b.llamafile --image photo.jpg -p "What's in this image?"
EOF
    exit 1
}

if [ $# -ne 3 ]; then
    usage
fi

LANGUAGE_MODEL="$1"
VISION_ENCODER="$2"
OUTPUT_FILE="$3"

# Check if input files exist
if [ ! -f "$LANGUAGE_MODEL" ]; then
echo "Error: Language model file not found: $LANGUAGE_MODEL"
exit 1
fi

if [ ! -f "$VISION_ENCODER" ]; then
echo "Error: Vision encoder file not found: $VISION_ENCODER"
exit 1
fi

# Check if zipalign exists
if [ ! -x "o/$(uname -m)/bin/zipalign" ]; then
    echo "Error: zipalign not found. Please build it first."
    echo "Run: make -j8"
    exit 1
fi

# Check if llamafile binary exists
if [ ! -f "o/$(uname -m)/bin/llamafile" ]; then
echo "Error: llamafile binary not found. Please build it first."
echo "Run: make -j8"
exit 1
fi

echo "Packaging LLaVA model..."
echo " Language model: $LANGUAGE_MODEL"
echo " Vision encoder: $VISION_ENCODER"
echo " Output file: $OUTPUT_FILE"

# Copy the llamafile binary
cp "o/$(uname -m)/bin/llamafile" "$OUTPUT_FILE"

# Use zipalign to add both GGUF files
echo "Adding GGUF files to llamafile..."
./o/$(uname -m)/bin/zipalign -j0 "$OUTPUT_FILE" \
    "$LANGUAGE_MODEL" \
    "$VISION_ENCODER"

# Make it executable
chmod +x "$OUTPUT_FILE"

echo ""
echo "Successfully created $OUTPUT_FILE!"
echo ""
echo "You can now use it without specifying --mmproj:"
echo " ./$OUTPUT_FILE --image photo.jpg -p \"What's in this image?\""
echo ""
echo "The vision encoder will be auto-detected from the embedded files."