
Commit 1426b10

feat: add processor example (#194)

* feat: add processor example
* refactor: Erase redundant code

1 parent c5ddda3

3 files changed: +76 -35 lines

wasmedge-mlx/vlm/Cargo.toml

Lines changed: 2 additions & 1 deletion
@@ -1,8 +1,9 @@
 [package]
 name = "wasmedge-vlm"
 version = "0.1.0"
-edition = "2024"
+edition = "2021"
 
 [dependencies]
 serde_json = "1.0"
 wasmedge-wasi-nn = "0.8.0"
+rust_processor = { git = "https://github.com/second-state/wasi_processor", subdirectory = "processor", branch = "main" }
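
The new `rust_processor` dependency replaces the Python `encode.py`/`decode.py` helpers: it tokenizes the prompt, preprocesses the image, and detokenizes the output inside the WASM module itself. Below is a minimal, illustrative sketch of loading it; the type paths and the `from_pretrained` entry point are taken from the `src/main.rs` changes in this commit, the model directory name from the README, and everything else is an assumption.

```rust
use rust_processor::auto::processing_auto::{AutoProcessor, AutoProcessorType};

fn main() {
    // Load the Hugging Face-style processor files shipped in the model directory.
    // AutoProcessor::from_pretrained and the Gemma3 variant are used exactly like
    // this in src/main.rs below; the rest of this sketch is illustrative only.
    match AutoProcessor::from_pretrained("gemma-3-4b-it-4bit") {
        Ok(AutoProcessorType::Gemma3(_processor)) => println!("loaded a Gemma3 processor"),
        Ok(_) => eprintln!("model directory does not contain a Gemma3 processor"),
        Err(e) => eprintln!("failed to load processor: {}", e),
    }
}
```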

wasmedge-mlx/vlm/README.md

Lines changed: 10 additions & 22 deletions
@@ -26,29 +26,22 @@ cmake --install build
 
 Then you will have an executable `wasmedge` runtime under `/usr/local/bin` and the WASI-NN with MLX backend plug-in under `/usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so` after installation.
 
-## Install dependencies
-
-Currently, we use the Python transformer library to embed the prompt and image to input the token. You can use any other library instead of this step.
-
-``` bash
-sudo apt install python3 python3-pip
-pip install transformers pillow mlx
-```
-
 ## Download the model and tokenizer
 
-In this example, we will use `gemma-3-4b-pt-bf16`.
+In this example, we will use `gemma-3-4b-it-4bit`.
 
 ``` bash
-git clone https://huggingface.co/mlx-community/gemma-3-4b-pt-bf16
+git clone https://huggingface.co/mlx-community/gemma-3-4b-it-4bit
 ```
 
 ## Build wasm
 
-Run the following command to build wasm, the output WASM file will be at `target/wasm32-wasip1/release/`
+Run the following command to build the WASM; the output file will be at `target/wasm32-wasip1/release/`.
+Then AOT-compile the WASM to improve performance.
 
 ```bash
 cargo build --target wasm32-wasip1 --release
+wasmedge compile ./target/wasm32-wasip1/release/wasmedge-vlm.wasm wasmedge-vlm_aot.wasm
 ```
 ## Execute
 
@@ -58,25 +51,20 @@ Execute the WASM with the `wasmedge` using nn-preload to load model.
 # Download sample image
 wget https://github.com/WasmEdge/WasmEdge/raw/master/docs/wasmedge-runtime-logo.png
 
-# python encode.py <model_path> <image_path> <prompt>
-python encode.py gemma-3-4b-it-bf16 wasmedge-runtime-logo.png "What is this icon?"
 
 wasmedge --dir .:. \
-  --nn-preload default:mlx:AUTO:model.safetensors \
-  ./target/wasm32-wasip1/release/wasmedge-vlm.wasm default
-
-# python encode.py <model_path> <Output mlx array path>
-python decode.py gemma-3-4b-it-bf16 Answer.npy
+  --nn-preload default:mlx:AUTO:gemma-3-4b-it-4bit/model.safetensors \
+  ./wasmedge-vlm_aot.wasm default gemma-3-4b-it-4bit
 
 ```
 
 If your model has multiple weight files, you need to provide all of them in the nn-preload.
 
 For example:
 ``` bash
-wasmedge --dir .:. \
-  --nn-preload default:mlx:AUTO:gemma-3-4b-it-bf16/model-00001-of-00002.safetensors:gemma-3-4b-it-bf16/model-00002-of-00002.safetensors \
-  ./target/wasm32-wasip1/release/wasmedge-vlm.wasm default
+wasmedge --dir .:. \
+  --nn-preload default:mlx:AUTO:gemma-3-4b-it-4bit/model-00001-of-00002.safetensors:gemma-3-4b-it-4bit/model-00002-of-00002.safetensors \
+  ./target/wasm32-wasip1/release/wasmedge-vlm.wasm default
 ```
 
 ## Other
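
Note how the new run command lines up with the argument handling in the `src/main.rs` changes that follow: the module now takes two positional arguments, the nn-preload graph alias and the model directory. A small stand-alone sketch of that mapping (the comment shows the README command; the variable names mirror the code below, the rest is illustrative):

```rust
use std::env;

fn main() {
    // wasmedge --dir .:. --nn-preload default:mlx:AUTO:gemma-3-4b-it-4bit/model.safetensors \
    //     ./wasmedge-vlm_aot.wasm default gemma-3-4b-it-4bit
    //                             ^^^^^^^ ^^^^^^^^^^^^^^^^^^
    //                             args[1]      args[2]
    let args: Vec<String> = env::args().collect();
    let model_name = &args[1]; // graph alias registered via --nn-preload ("default")
    let model_dir = &args[2];  // directory holding config.json and the processor files
    println!("graph alias: {}, model dir: {}", model_name, model_dir);
}
```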

wasmedge-mlx/vlm/src/main.rs

Lines changed: 64 additions & 12 deletions
@@ -1,10 +1,17 @@
-use serde_json::json;
+use rust_processor::auto::processing_auto::AutoProcessor;
+use rust_processor::gemma3::detokenizer::decode;
+use rust_processor::processor_utils::prepare_inputs;
+use rust_processor::NDTensorI32;
 use std::env;
 use wasmedge_wasi_nn::{
     self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
 };
 
-fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
+use serde_json::Value;
+use std::fs::File;
+use std::io::{self, BufReader};
+
+fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> NDTensorI32 {
     // Preserve for 4096 tokens with average token length 6
     const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
     let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
@@ -13,44 +20,89 @@ fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> Strin
         .expect("Failed to get output");
     output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);
 
-    return String::from_utf8_lossy(&output_buffer[..output_size]).to_string();
+    return NDTensorI32::from_bytes(&output_buffer[..output_size]).unwrap();
 }
 
-fn get_output_from_context(context: &GraphExecutionContext) -> String {
+fn get_output_from_context(context: &GraphExecutionContext) -> NDTensorI32 {
     get_data_from_context(context, 0)
 }
 
+fn read_json(path: &str) -> io::Result<Value> {
+    let file = File::open(path)?;
+    let reader = BufReader::new(file);
+    let v = serde_json::from_reader(reader)
+        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+    Ok(v)
+}
+
 fn main() {
     // prompt: "What is this icon?";
     // image: "wasmedge-runtime-logo.png";
     let args: Vec<String> = env::args().collect();
     let model_name: &str = &args[1];
+    let model_dir = &args[2];
+    let config = read_json(&format!("{}/config.json", model_dir)).unwrap();
+    let prompt = "<bos><start_of_turn>user\
+        What is this icon?<start_of_image><end_of_turn>\
+        <start_of_turn>model";
+    let image_path = "wasmedge-runtime-logo.png";
+    println!("create processor: {}", model_dir);
+    let mut processor = match AutoProcessor::from_pretrained(model_dir) {
+        Ok(processor) => match processor {
+            rust_processor::auto::processing_auto::AutoProcessorType::Gemma3(processor) => {
+                processor
+            }
+            _ => {
+                eprintln!("Error loading processor: not a Gemma3Processor");
+                return;
+            }
+        },
+        Err(e) => {
+            eprintln!("Error loading processor: {}", e);
+            return;
+        }
+    };
+    println!("processor created");
+    let image_token_index = config["image_token_index"].as_u64().unwrap_or(262144) as u32;
+    let model_inputs = prepare_inputs(
+        &mut processor,
+        &[image_path], // Use single image array
+        prompt,
+        image_token_index,
+        Some((896, 896)), // Use 896x896 as image size
+    );
     let graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
-        .config(
-            serde_json::to_string(&json!({"model_type": "gemma3", "max_token":250}))
-                .expect("Failed to serialize options"),
-        )
+        .config(config.to_string())
        .build_from_cache(model_name)
         .expect("Failed to build graph");
 
     let mut context = graph
         .init_execution_context()
         .expect("Failed to init context");
 
-    let tensor_data = "input_ids.npy".as_bytes().to_vec();
+    let tensor_data = model_inputs["input_ids"].to_bytes();
     context
         .set_input(0, TensorType::U8, &[1], &tensor_data)
         .expect("Failed to set input");
-    let tensor_data = "pixel_values.npy".as_bytes().to_vec();
+    let tensor_data = model_inputs["pixel_values"].to_bytes();
     context
         .set_input(1, TensorType::U8, &[1], &tensor_data)
         .expect("Failed to set input");
-    let tensor_data = "mask.npy".as_bytes().to_vec();
+    let tensor_data = model_inputs["mask"].to_bytes();
     context
         .set_input(2, TensorType::U8, &[1], &tensor_data)
         .expect("Failed to set input");
 
     context.compute().expect("Failed to compute");
-    let output = get_output_from_context(&context);
+    let tokens = get_output_from_context(&context);
+    let output = decode(
+        &tokens
+            .data
+            .into_iter()
+            .map(|x| x as usize)
+            .collect::<Vec<_>>(),
+        &processor,
+        true,
    );
     println!("{}", output.trim());
 }
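
One notable change above is how the graph is configured: instead of serializing a small hand-written `{"model_type": "gemma3", "max_token": 250}` object, the example now forwards the model's own `config.json` to the MLX backend and also reads `image_token_index` from it on the host side. A minimal sketch of that pattern in isolation follows; the file path and the 262144 fallback come from the diff, while which config keys the backend actually consumes is not restated here.

```rust
use serde_json::Value;
use std::fs::File;
use std::io::BufReader;

fn main() {
    // Load the model's config.json and keep it as an untyped JSON value.
    let file = File::open("gemma-3-4b-it-4bit/config.json").expect("missing config.json");
    let config: Value = serde_json::from_reader(BufReader::new(file)).expect("invalid JSON");

    // Individual fields can still be read on the host side; 262144 is the
    // fallback image token id used in the example above.
    let image_token_index = config["image_token_index"].as_u64().unwrap_or(262144) as u32;
    println!("image_token_index = {}", image_token_index);

    // The whole document is passed to the backend as a string, matching
    // `.config(config.to_string())` in the code above.
    println!("config passed to the backend:\n{}", config);
}
```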
