Commit c45afb5

feat: add Gemma 3 Vision-Language Model (VLM) support
- Add SigLIP Vision Encoder (27 transformer layers)
- Add Multi-Modal Projector (AvgPool + Linear projection)
- Add ImageProcessor for loading/resizing/normalizing images
- Add Gemma3VLM model class combining vision + text
- Extend CLI with --image flag for VLM generation
- Add /image command for interactive VLM usage
- Add isVLM() and generateWithImage() to Node.js API
- Auto-detect VLM models via vision_config in config.json

Supports Gemma 3 4B, 12B, 27B vision variants with:
- 896x896 image input
- 256 visual tokens per image
- Streaming output for both text and VLM generation
1 parent 740dc9d commit c45afb5
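
The new Node.js API can be exercised end to end as sketched below. This is a minimal sketch, not code from the commit: the package import name and the model id are illustrative, and any Gemma 3 vision variant detected via vision_config should report isVLM() === true.

// Minimal sketch of the new VLM API; package name and model id are illustrative.
import { loadModel } from "node-mlx"

const model = loadModel("mlx-community/gemma-3-4b-it-4bit") // hypothetical model id

if (model.isVLM()) {
  // Tokens stream directly to stdout; the return value carries only stats.
  const stats = model.generateWithImage("What's in this image?", "./photo.jpg", {
    maxTokens: 256,
    temperature: 0.7,
    topP: 0.9
  })
  console.log(`\n${stats.tokenCount} tokens @ ${stats.tokensPerSecond.toFixed(1)} tok/s`)
} else {
  // Text-only models keep the existing streaming path.
  model.generateStreaming("Describe the image pipeline in one sentence.", { maxTokens: 128 })
}

model.unload()

As with generateStreaming, generateWithImage writes the generated text straight to stdout and returns only token statistics.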

10 files changed: +1506 −13 lines changed


packages/node-mlx/native/src/binding.cc

Lines changed: 83 additions & 0 deletions
@@ -14,11 +14,15 @@ typedef bool (*IsAvailableFn)(void);
 typedef char* (*GetVersionFn)(void);
 typedef bool (*SetMetallibPathFn)(const char*);
 typedef char* (*GenerateStreamingFn)(int32_t, const char*, int32_t, float, float);
+typedef char* (*GenerateWithImageFn)(int32_t, const char*, const char*, int32_t, float, float);
+typedef bool (*IsVLMFn)(int32_t);
 
 static LoadModelFn fn_load_model = nullptr;
 static UnloadModelFn fn_unload_model = nullptr;
 static GenerateFn fn_generate = nullptr;
 static GenerateStreamingFn fn_generate_streaming = nullptr;
+static GenerateWithImageFn fn_generate_with_image = nullptr;
+static IsVLMFn fn_is_vlm = nullptr;
 static FreeStringFn fn_free_string = nullptr;
 static IsAvailableFn fn_is_available = nullptr;
 static GetVersionFn fn_get_version = nullptr;
@@ -58,6 +62,8 @@ Napi::Value Initialize(const Napi::CallbackInfo& info) {
   fn_get_version = (GetVersionFn)dlsym(dylib_handle, "node_mlx_version");
   fn_set_metallib_path = (SetMetallibPathFn)dlsym(dylib_handle, "node_mlx_set_metallib_path");
   fn_generate_streaming = (GenerateStreamingFn)dlsym(dylib_handle, "node_mlx_generate_streaming");
+  fn_generate_with_image = (GenerateWithImageFn)dlsym(dylib_handle, "node_mlx_generate_with_image");
+  fn_is_vlm = (IsVLMFn)dlsym(dylib_handle, "node_mlx_is_vlm");
 
   if (!fn_load_model || !fn_generate || !fn_free_string) {
     std::string missing;
@@ -236,6 +242,81 @@ Napi::Value GenerateStreaming(const Napi::CallbackInfo& info) {
   return Napi::String::New(env, jsonStr);
 }
 
+// Generate text with image (VLM) - tokens are written directly to stdout
+Napi::Value GenerateWithImage(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+
+  if (!fn_generate_with_image) {
+    Napi::Error::New(env, "VLM generation not available").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  if (info.Length() < 3 || !info[0].IsNumber() || !info[1].IsString() || !info[2].IsString()) {
+    Napi::TypeError::New(env, "Usage: generateWithImage(handle, prompt, imagePath, options?)").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  int32_t handle = info[0].As<Napi::Number>().Int32Value();
+  std::string prompt = info[1].As<Napi::String>().Utf8Value();
+  std::string imagePath = info[2].As<Napi::String>().Utf8Value();
+
+  // Default options
+  int32_t maxTokens = 256;
+  float temperature = 0.7f;
+  float topP = 0.9f;
+
+  // Parse options object if provided
+  if (info.Length() > 3 && info[3].IsObject()) {
+    Napi::Object options = info[3].As<Napi::Object>();
+
+    if (options.Has("maxTokens")) {
+      maxTokens = options.Get("maxTokens").As<Napi::Number>().Int32Value();
+    }
+    if (options.Has("temperature")) {
+      temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
+    }
+    if (options.Has("topP")) {
+      topP = options.Get("topP").As<Napi::Number>().FloatValue();
+    }
+  }
+
+  // Flush stdout before calling streaming generate
+  fflush(stdout);
+
+  char* jsonResult = fn_generate_with_image(handle, prompt.c_str(), imagePath.c_str(), maxTokens, temperature, topP);
+
+  // Flush again after generation
+  fflush(stdout);
+
+  if (!jsonResult) {
+    Napi::Error::New(env, "Generate with image returned null").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  std::string jsonStr(jsonResult);
+  fn_free_string(jsonResult);
+
+  // Return the JSON string with stats
+  return Napi::String::New(env, jsonStr);
+}
+
+// Check if model is a VLM (Vision-Language Model)
+Napi::Value IsVLM(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+
+  if (!fn_is_vlm) {
+    return Napi::Boolean::New(env, false);
+  }
+
+  if (info.Length() < 1 || !info[0].IsNumber()) {
+    Napi::TypeError::New(env, "Model handle number required").ThrowAsJavaScriptException();
+    return Napi::Boolean::New(env, false);
+  }
+
+  int32_t handle = info[0].As<Napi::Number>().Int32Value();
+  return Napi::Boolean::New(env, fn_is_vlm(handle));
+}
+
 // Check if MLX is available
 Napi::Value IsAvailable(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
@@ -282,6 +363,8 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   exports.Set("unloadModel", Napi::Function::New(env, UnloadModel));
   exports.Set("generate", Napi::Function::New(env, Generate));
   exports.Set("generateStreaming", Napi::Function::New(env, GenerateStreaming));
+  exports.Set("generateWithImage", Napi::Function::New(env, GenerateWithImage));
+  exports.Set("isVLM", Napi::Function::New(env, IsVLM));
   exports.Set("isAvailable", Napi::Function::New(env, IsAvailable));
   exports.Set("getVersion", Napi::Function::New(env, GetVersion));
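
For context on the exports registered above: the addon is consumed handle-first from JavaScript, and like generateStreaming, the new generateWithImage streams tokens to stdout and returns a JSON stats string. A rough sketch of driving the addon directly follows; the .node path and the describeImage helper are hypothetical, while the call signatures mirror the NativeBinding typings in packages/node-mlx/src/index.ts.

// Sketch only: drives the N-API addon directly, bypassing the index.ts wrapper.
// The addon path is an assumption; handle comes from the existing loadModel binding.
import { createRequire } from "node:module"

const require = createRequire(import.meta.url)
const binding = require("./build/Release/node_mlx.node")

export function describeImage(handle: number, prompt: string, imagePath: string): void {
  if (!binding.isVLM(handle)) {
    throw new Error("Loaded model has no vision tower")
  }
  // Tokens stream straight to stdout; the return value is a JSON stats string,
  // e.g. {"success":true,"tokenCount":42,"tokensPerSecond":31.5}.
  const raw: string = binding.generateWithImage(handle, prompt, imagePath, {
    maxTokens: 256,
    temperature: 0.7,
    topP: 0.9
  })
  const stats = JSON.parse(raw)
  if (!stats.success) {
    throw new Error(stats.error ?? "Generation failed")
  }
  console.log(`\n${stats.tokenCount} tokens @ ${stats.tokensPerSecond} tok/s`)
}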

packages/node-mlx/src/cli.ts

Lines changed: 68 additions & 9 deletions
@@ -54,11 +54,16 @@ function printHelp() {
   log(`  mlx                      Interactive chat`)
   log(`  mlx "prompt"             One-shot generation`)
   log(`  mlx --model <name>       Use specific model`)
+  log(`  mlx --image <path>       Include image (VLM only)`)
   log(`  mlx --list               List available models`)
   log(`  mlx --help               Show this help`)
   log("")
+  log(`${colors.bold}Vision models (VLM):${colors.reset}`)
+  log(`  mlx --model gemma-3-4b --image photo.jpg "What's in this image?"`)
+  log("")
   log(`${colors.bold}Interactive commands:${colors.reset}`)
   log(`  /model <name>            Switch model`)
+  log(`  /image <path>            Set image for next prompt`)
   log(`  /temp <0-2>              Set temperature`)
   log(`  /tokens <n>              Set max tokens`)
   log(`  /clear                   Clear conversation`)
@@ -167,6 +172,7 @@ interface ChatState {
   modelName: string
   options: GenerationOptions
   history: Array<{ role: "user" | "assistant"; content: string }>
+  imagePath: string | null // For VLM image input
 }
 
 async function runInteractive(initialModel: string) {
@@ -178,7 +184,8 @@ async function runInteractive(initialModel: string) {
       temperature: 0.7,
       topP: 0.9
     },
-    history: []
+    history: [],
+    imagePath: null
   }
 
   // Load initial model
@@ -235,8 +242,16 @@ async function runInteractive(initialModel: string) {
     process.stdout.write(`${colors.magenta}AI:${colors.reset} `)
 
     try {
-      // Use streaming - tokens are written directly to stdout
-      const result = state.model.generateStreaming(fullPrompt, state.options)
+      let result
+
+      // Check if we have an image to send
+      if (state.imagePath && state.model.isVLM()) {
+        result = state.model.generateWithImage(fullPrompt, state.imagePath, state.options)
+        state.imagePath = null // Clear after use
+      } else {
+        // Use streaming - tokens are written directly to stdout
+        result = state.model.generateStreaming(fullPrompt, state.options)
+      }
 
       // Note: text already streamed, we only have stats
       log("")
@@ -374,21 +389,61 @@ async function handleCommand(input: string, state: ChatState, rl: readline.Interface) {
       printModels()
      break
 
+    case "image":
+    case "i":
+      if (!arg) {
+        if (state.imagePath) {
+          log(`${colors.dim}Current image: ${state.imagePath}${colors.reset}`)
+        } else {
+          log(`${colors.dim}No image set. Use /image <path> to set one.${colors.reset}`)
+        }
+      } else {
+        // Check if file exists
+        const fs = await import("node:fs")
+        if (!fs.existsSync(arg)) {
+          error(`Image not found: ${arg}`)
+        } else if (!state.model?.isVLM()) {
+          error(`Current model doesn't support images. Use a VLM like gemma-3-4b.`)
+        } else {
+          state.imagePath = arg
+          log(`${colors.green}✓${colors.reset} Image set: ${arg}`)
+          log(`${colors.dim}The next prompt will include this image.${colors.reset}`)
+        }
+      }
+      break
+
     default:
       error(`Unknown command: /${cmd}. Type /help for commands.`)
   }
 }
 
-async function runOneShot(modelName: string, prompt: string, options: GenerationOptions) {
+async function runOneShot(
+  modelName: string,
+  prompt: string,
+  imagePath: string | null,
+  options: GenerationOptions
+) {
   log(`${colors.dim}Loading ${modelName}...${colors.reset}`)
 
   const modelId = resolveModel(modelName)
 
   try {
     const model = loadModel(modelId)
 
-    // Use streaming - tokens are written directly to stdout
-    const result = model.generateStreaming(prompt, options)
+    let result
+
+    // Check if we have an image to process
+    if (imagePath) {
+      if (!model.isVLM()) {
+        error(`Model ${modelName} doesn't support images. Use a VLM like gemma-3-4b.`)
+        model.unload()
+        process.exit(1)
+      }
+      result = model.generateWithImage(prompt, imagePath, options)
+    } else {
+      // Use streaming - tokens are written directly to stdout
+      result = model.generateStreaming(prompt, options)
    }
 
     // Add newline after streamed output
     log("")
@@ -407,12 +462,14 @@ async function runOneShot(modelName: string, prompt: string, options: GenerationOptions) {
 function parseArgs(): {
   model: string
   prompt: string | null
+  imagePath: string | null
   options: GenerationOptions
   command: "chat" | "oneshot" | "list" | "help" | "version"
 } {
   const args = process.argv.slice(2)
   let model = "qwen" // Default to Qwen (no auth required)
   let prompt: string | null = null
+  let imagePath: string | null = null
   const options: GenerationOptions = {
     maxTokens: 512,
     temperature: 0.7,
@@ -431,6 +488,8 @@ function parseArgs(): {
       command = "list"
     } else if (arg === "--model" || arg === "-m") {
      model = args[++i] || model
+    } else if (arg === "--image" || arg === "-i") {
+      imagePath = args[++i] || null
     } else if (arg === "--temp" || arg === "-t") {
       options.temperature = parseFloat(args[++i] || "0.7")
     } else if (arg === "--tokens" || arg === "-n") {
@@ -446,12 +505,12 @@
     }
   }
 
-  return { model, prompt, options, command }
+  return { model, prompt, imagePath, options, command }
 }
 
 // Main
 async function main() {
-  const { model, prompt, options, command } = parseArgs()
+  const { model, prompt, imagePath, options, command } = parseArgs()
 
   // Commands that don't need Apple Silicon
   switch (command) {
@@ -486,7 +545,7 @@ async function main() {
 
   switch (command) {
     case "oneshot":
-      await runOneShot(model, prompt!, options)
+      await runOneShot(model, prompt!, imagePath, options)
      break
 
     case "chat":

packages/node-mlx/src/index.ts

Lines changed: 41 additions & 0 deletions
@@ -24,6 +24,13 @@ interface NativeBinding {
     prompt: string,
     options?: { maxTokens?: number; temperature?: number; topP?: number }
   ): string // Streams to stdout, returns JSON stats
+  generateWithImage(
+    handle: number,
+    prompt: string,
+    imagePath: string,
+    options?: { maxTokens?: number; temperature?: number; topP?: number }
+  ): string // VLM: Streams to stdout, returns JSON stats
+  isVLM(handle: number): boolean
   isAvailable(): boolean
   getVersion(): string
 }
@@ -155,6 +162,12 @@ export interface Model {
   /** Generate text with streaming - tokens are written directly to stdout */
   generateStreaming(prompt: string, options?: GenerationOptions): StreamingResult
 
+  /** Generate text from a prompt with an image (VLM only) */
+  generateWithImage(prompt: string, imagePath: string, options?: GenerationOptions): StreamingResult
+
+  /** Check if this model supports images (is a Vision-Language Model) */
+  isVLM(): boolean
+
   /** Unload the model from memory */
   unload(): void
 
@@ -302,6 +315,34 @@ export function loadModel(modelId: string): Model {
       }
     },
 
+    generateWithImage(
+      prompt: string,
+      imagePath: string,
+      options?: GenerationOptions
+    ): StreamingResult {
+      // VLM generation with image - tokens are written directly to stdout by Swift
+      const jsonStr = b.generateWithImage(handle, prompt, imagePath, {
+        maxTokens: options?.maxTokens ?? 256,
+        temperature: options?.temperature ?? 0.7,
+        topP: options?.topP ?? 0.9
+      })
+
+      const result = JSON.parse(jsonStr) as JSONGenerationResult
+
+      if (!result.success) {
+        throw new Error(result.error ?? "Generation failed")
+      }
+
+      return {
+        tokenCount: result.tokenCount ?? 0,
+        tokensPerSecond: result.tokensPerSecond ?? 0
+      }
+    },
+
+    isVLM(): boolean {
+      return b.isVLM(handle)
+    },
+
     unload(): void {
       b.unloadModel(handle)
     }
