# Vision/VLM Branch Patterns (Notes)

This repo's `axllm` branch currently runs **text-only** models by:

- `tokenizer->encode(history)` to token ids
- compute `tokens_diff` vs `last_tokens_ids` (append-only fast path)
- `SetKVCache(k/v, precompute_len, input_num_token)`
- build `out_embed` for `tokens_diff` via `embed_selector.getByIndex(id)`
- `Run(out_embed)` to do prefill/decode
- `GetKVCache(...)`, append assistant reply, update `last_tokens_ids`

So the "context" (KV cache) support on `axllm` is based on **token-id diff** + caching.
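
The diff step above amounts to a longest-common-prefix scan. A minimal sketch (names like `ComputeTokenDiff` are illustrative, not the repo's actual API):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Append-only fast path: compare the freshly encoded ids against the previous
// turn's ids; the shared prefix is already covered by the KV cache, and only
// the tail needs embedding + prefill.
struct TokenDiff {
    size_t precompute_len;          // tokens already in the KV cache
    std::vector<int> tokens_diff;   // new tail to embed and prefill
};

TokenDiff ComputeTokenDiff(const std::vector<int>& last_tokens_ids,
                           const std::vector<int>& token_ids) {
    size_t common = 0;
    while (common < last_tokens_ids.size() && common < token_ids.size() &&
           last_tokens_ids[common] == token_ids[common]) {
        ++common;
    }
    return {common,
            std::vector<int>(token_ids.begin() + common, token_ids.end())};
}
```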

Below are distilled patterns from the VLM branches (Qwen/InternVL/FastVLM/SmolVLM2, incl. multi-frame/video variants).

## 1) How Images/Videos Get Into The Prompt

All VLM branches follow the same high-level idea:

1. The tokenizer's chat template emits **placeholder tokens** for each media item.
2. The vision encoder produces **per-placeholder embedding vectors**.
3. The code finds the placeholder positions in `input_ids` and **replaces** the corresponding token embeddings in `out_embed`.

The differences are in (a) which placeholder tokens are used and (b) how to locate them.
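
Step 3 can be sketched as a scan-and-overwrite over a flat embedding buffer. A hedged sketch, assuming row-major bf16 buffers with one `embed_size` row per token (function and parameter names are illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Replace the embedding row at every placeholder position with the next row
// from the vision encoder output. bf16 values are carried as raw uint16_t.
void InjectVisionEmbeds(const std::vector<int>& input_ids,
                        int image_token_id,
                        const std::vector<uint16_t>& vision_embeds, // bf16 rows
                        size_t embed_size,
                        std::vector<uint16_t>& out_embed) {
    size_t next_row = 0;
    for (size_t i = 0; i < input_ids.size(); ++i) {
        if (input_ids[i] != image_token_id) continue;
        if ((next_row + 1) * embed_size > vision_embeds.size()) break; // safety
        std::memcpy(&out_embed[i * embed_size],
                    &vision_embeds[next_row * embed_size],
                    embed_size * sizeof(uint16_t));
        ++next_row;
    }
}
```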

### A. Qwen3/Qwen2.5-VL style

- Prompt uses `<|vision_start|> ... <|vision_end|>` wrapping.
- Inside the vision block, placeholders are repeated:
  - image: `<|image_pad|>` repeated `num_media * num_media_tokens` times
  - video: `<|video_pad|>` repeated `num_media * num_media_tokens` times
- Injection offsets are typically found by scanning for `vision_start_token_id` and taking `offset = i + 1` (the first placeholder after the start token).

### B. InternVL style

- Prompt uses `<img> ... </img>` wrapping.
- Placeholder token is usually `<IMG_CONTEXT>`, repeated `num_media_tokens` times.
- Some templates place `content.data` before the placeholders; others after.
- Injection offsets are found by scanning `input_ids` for `IMAGE_CONTEXT_TOKEN` (or via `<|vision_start|>` then the next token).

### C. FastVLM style

- Similar to InternVL: uses a simple placeholder token (e.g. `"<image>"`) repeated.
- Code scans for a fixed `IMAGE_CONTEXT_TOKEN` id in `input_ids` and overwrites those slots.

### D. SmolVLM2 style

- Placeholder token is `"<image>"` (often a single token id).
- The template may include additional "header tokens" around the image tokens.
- Injection offsets are found by detecting **runs** of consecutive `IMAGE_CONTEXT_TOKEN` ids, pushing the first index of each run.

## 2) Vision Encoder Output Shapes And Preprocess

There are two broad encoder IO styles:

### A. "Classic image encoder" (single image -> embedding sequence)

Common in `ax-fastvlm`, `ax-internvl`:

- Determine input layout:
  - NCHW float input: normalize `(x/255 - mean) / std`, write as `float` into the input tensor.
  - NHWC u8 input: resize + RGB, memcpy into the input tensor.
- Determine output dtype:
  - if output size matches `elem_count * 2` => bf16 output
  - if output size matches `elem_count * 4` => fp32 output (then convert to bf16)
- Result is a flat bf16 array whose length is `num_media_tokens * tokens_embed_size` (conceptually).
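
The dtype probe above is just a byte-size comparison. A minimal sketch:

```cpp
#include <cassert>
#include <cstddef>

// Compare the output tensor's byte size against its logical element count to
// decide how to interpret the buffer.
enum class OutDtype { BF16, FP32, Unknown };

OutDtype DetectOutputDtype(size_t output_bytes, size_t elem_count) {
    if (output_bytes == elem_count * 2) return OutDtype::BF16;
    if (output_bytes == elem_count * 4) return OutDtype::FP32; // convert to bf16 afterwards
    return OutDtype::Unknown;
}
```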

### B. Qwen-VL "video processor" (frames -> patches -> embedding sequence)

Common in `ax-qwen2_5-vl`, `ax-qwen3-vl`, `axcl-qwen3-vl`:

- Preprocess frames (even for a single image) via a "video-like" patching pipeline:
  - resize to `(vision_config.height, vision_config.width)`
  - convert to RGB
  - temporal patching (`temporal_patch_size`)
  - spatial merge (`spatial_merge_size`)
  - patch size (`patch_size`)
- Produces `pixel_values` per grid segment (for videos: multiple segments).
- Each segment is fed to `image_encoder` to produce an embedding block.
- Tracks `cfg.image_grid_thw` and/or `cfg.video_grid_thw` for mRoPE.
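
Conceptually, the `(t, h, w)` grid follows from the frame count and the resized frame size. A hedged sketch (the exact formulas in the branches may differ, e.g. around padding and merge handling):

```cpp
#include <array>
#include <cassert>

// Illustrative: one grid cell per patch after temporal + spatial patching.
struct VisionConfig {
    int height, width;
    int patch_size;
    int temporal_patch_size;
};

std::array<int, 3> ComputeGridTHW(int num_frames, const VisionConfig& cfg) {
    int grid_t = (num_frames + cfg.temporal_patch_size - 1) / cfg.temporal_patch_size;
    int grid_h = cfg.height / cfg.patch_size;
    int grid_w = cfg.width / cfg.patch_size;
    return {grid_t, grid_h, grid_w};
}
```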

## 3) Extra Side-Inputs Used By Some VLMs

### A. mRoPE / position ids (Qwen-VL)

Qwen-VL branches compute `position_ids` (3 x seq_len) based on:

- `input_ids`
- `cfg.image_grid_thw` / `cfg.video_grid_thw`
- vision settings: `spatial_merge_size`, sometimes video time scaling (`second_per_grid_ts`)

These `position_ids` are then used in prefill by writing them into the model's `indices` input
instead of a simple monotonically increasing index.
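
A greatly simplified sketch of the shape involved: for pure text tokens all three rows carry the same monotonically increasing value, while vision spans would instead enumerate temporal/height/width grid coordinates (omitted here). This is illustrative only, not the branches' actual mRoPE implementation:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Text-only degenerate case of the 3 x seq_len position_ids layout.
std::vector<std::vector<int>> TextOnlyPositionIds(size_t seq_len) {
    std::vector<std::vector<int>> position_ids(3, std::vector<int>(seq_len));
    for (size_t i = 0; i < seq_len; ++i)
        position_ids[0][i] = position_ids[1][i] = position_ids[2][i] = (int)i;
    return position_ids;
}
```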

### B. deepstack features (some AXCL Qwen-VL variants)

Some image encoders output additional tensors (e.g. 3 "deepstack features").
During prefill, for tokens where `visual_pos_mask[j] == 1`, the code adds those features
into the intermediate embedding stream (bf16 -> fp32 add -> bf16).
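
The bf16 round-trip in that add step can be sketched as follows. bf16 is the upper 16 bits of an IEEE-754 float, so the add happens in fp32 and is truncated back (the real code's rounding behavior may differ):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// bf16 <-> fp32 helpers: bf16 is the high half of the fp32 bit pattern.
float Bf16ToF32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

uint16_t F32ToBf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (uint16_t)(bits >> 16); // truncate (no rounding) for simplicity
}

// bf16 -> fp32 add -> bf16, as used when merging deepstack features.
uint16_t AddBf16(uint16_t a, uint16_t b) {
    return F32ToBf16(Bf16ToF32(a) + Bf16ToF32(b));
}
```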

### C. visual_pos_mask

Computed from `input_ids` by marking positions that equal `image_token_id` or `video_token_id`.
Used to align deepstack features with only the visual placeholder positions.

## 4) How Multi-Image / Multi-Frame Is Represented

All branches encode multi-image input as:

- `Content{ role=USER, type=IMAGE, data=prompt, num_media=N, num_media_tokens=T }`
- The tokenizer repeats placeholder tokens `N * T` times.
- The vision encoder returns `N` blocks, each of length `T * tokens_embed_size`.

For video:

- Some branches treat each temporal grid segment as one "media block".
- Some compute `cfg.video_grid_thw = {{grid_t, grid_h, grid_w}}` and then expand internally.

## 5) Key Gap vs `axllm` (Context Support)

The VLM branches above usually build `input_ids` for the full prompt and then build the full
`out_embed` (text token embeddings + injected vision embeddings) in one go.

`axllm` is different:

- It only materializes embeddings for `tokens_diff` (the incremental tail) to support KV-cache context.
- It currently has **no hook** to replace placeholder token embeddings with vision embeddings.

So a pluggable image encoder for `axllm` must solve:

- When `tokens_diff` contains placeholder ids, produce embeddings that include:
  - normal token embeddings for text tokens
  - vision embeddings for the placeholder slots
- While keeping the existing "token-id diff + KV cache" logic intact.

Practical implication:

- The "media -> placeholder slots" mapping must be reproducible from `(history, token_ids)` and/or
  persisted state, so incremental encoding can inject the correct vision embeddings only for the
  newly appended part of the conversation.
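
A hedged sketch of what tail-only injection could look like: a map from absolute token position to a vision embedding row lets the tail lookup stay correct across incremental turns, because the diff tail starts at `precompute_len`. Names, layout, and the map type are illustrative, not the branch's actual API:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <map>
#include <vector>

// Overwrite only those rows of the tail's embedding buffer whose absolute
// position is a known vision placeholder slot; text tokens keep the token
// embedding already written into out_embed.
void EmbedTail(const std::vector<int>& tokens_diff,
               size_t precompute_len,
               size_t embed_size,
               const std::map<size_t, const uint16_t*>& pos2vision,
               std::vector<uint16_t>& out_embed /* tokens_diff.size() * embed_size */) {
    for (size_t j = 0; j < tokens_diff.size(); ++j) {
        size_t abs_pos = precompute_len + j; // position in the full sequence
        auto it = pos2vision.find(abs_pos);
        if (it == pos2vision.end()) continue;
        std::memcpy(&out_embed[j * embed_size], it->second,
                    embed_size * sizeof(uint16_t));
    }
}
```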

## 6) Abstraction Axes For A Pluggable Vision Module (Proposed)

To unify the branches without changing `axllm`'s main control flow, the pluggable module needs
to cover these responsibilities:

- `Tokenizer side`:
  - define placeholder tokens + how many tokens per media item (`num_media_tokens`)
  - define how to locate placeholder offsets in `input_ids`
  - (optional) provide `position_ids` generation rules (mRoPE)
- `Vision side`:
  - load/init image encoder axmodel(s)
  - preprocess image/video frames
  - produce per-media embedding blocks (bf16, `tokens_embed_size` aligned)
  - (optional) deepstack features
- `Injection side`:
  - given `input_ids` and media blocks, produce an "embedding stream" where placeholder slots
    are overwritten by vision embeddings
  - for `axllm` context mode: support doing the above for **only the tail** (`tokens_diff`),
    while keeping placeholder alignment correct.

These notes are intentionally implementation-agnostic so we can use them as a checklist when
refactoring `axllm` to support LLM + VLM with context.

## 7) Current `axllm` Implementation Notes (This Branch)

This branch adds a pluggable vision module behind a runtime config switch (no compile-time toggle):

- `config.json`: `vlm_type` (or `VLM_TYPE`) selects the vision module.
- If `vlm_type` is not `"None"` (0), `filename_image_encoder_axmodel` must be set.
- Vision preprocessing backend is selected at **CMake configure time**:
  - Prefer OpenCV if found.
  - Otherwise fall back to `third_party/SimpleCV` and print a CMake warning (slight differences vs OpenCV are possible).
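
A minimal sketch of the relevant `config.json` fields; only `vlm_type` and `filename_image_encoder_axmodel` are named in these notes, and the `"qwen3-vl"` value shown here is an illustrative placeholder, not a confirmed accepted value:

```json
{
  "vlm_type": "qwen3-vl",
  "filename_image_encoder_axmodel": "image_encoder.axmodel"
}
```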

VLM runtime data flow (keeps the existing token-diff + KV-cache logic):

- The tokenizer still generates placeholder tokens based on `Content.num_media` and `Content.num_media_tokens`.
- The vision module prepares a copy of `history` with those two fields filled in, then:
  - encodes images/videos with `image_encoder.axmodel`
  - builds a `pos2vision` mapping for placeholder positions in `input_ids`
  - (Qwen-VL) computes `position_ids` (mRoPE) and a decode start override
- The LLM loop only builds embeddings for `tokens_diff`, and replaces only the vision placeholder slots in that tail.