Skip to content

Commit be2d818

Browse files
committed
feat: add pluggable vision module for VLM/LLM with context support
1 parent af18354 commit be2d818

File tree

23 files changed

+4392
-34
lines changed

23 files changed

+4392
-34
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "third_party/openai-api.cpp"]
55
path = third_party/openai-api.cpp
66
url = https://github.com/ZHEQIUSHUI/openai-api.cpp
7+
[submodule "third_party/SimpleCV"]
8+
path = third_party/SimpleCV
9+
url = https://github.com/ZHEQIUSHUI/SimpleCV.git

CMakeLists.txt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,36 @@ endif()
7474
# ---------------------------------------------------------------------------
7575
add_subdirectory(third_party/tokenizer.axera)
7676
include_directories(third_party/tokenizer.axera/include)
77+
include_directories(third_party/magic_enum/include)
7778

7879
include(overlook.cmake)
7980

8081
include_directories(src)
8182
include_directories(src/runner)
8283
include_directories(src/runner/utils)
8384

85+
# ---------------------------------------------------------------------------
86+
# Vision preprocessing (OpenCV)
87+
# ---------------------------------------------------------------------------
88+
set(AXLLM_VISION_LIBS "")
89+
find_package(OpenCV QUIET)
90+
if(OpenCV_FOUND)
91+
message(STATUS "OpenCV found (${OpenCV_VERSION})")
92+
add_compile_definitions(AXLLM_USE_OPENCV)
93+
include_directories(${OpenCV_INCLUDE_DIRS})
94+
set(AXLLM_VISION_LIBS ${OpenCV_LIBS})
95+
else()
96+
# Fallback: SimpleCV (stb-based tiny OpenCV-like lib)
97+
if(EXISTS ${CMAKE_SOURCE_DIR}/third_party/SimpleCV/CMakeLists.txt)
98+
message(WARNING "OpenCV not found: using third_party/SimpleCV for image preprocessing (may have slight differences vs OpenCV)")
99+
set(SIMPLECV_BUILD_TESTS OFF CACHE BOOL "Build SimpleCV tests" FORCE)
100+
add_subdirectory(third_party/SimpleCV)
101+
set(AXLLM_VISION_LIBS SimpleCV::simplecv)
102+
else()
103+
message(FATAL_ERROR "OpenCV not found and third_party/SimpleCV is missing. Install OpenCV or init submodule third_party/SimpleCV.")
104+
endif()
105+
endif()
106+
84107
set(OPENAI_API_BUILD_EXAMPLES OFF)
85108
set(OPENAI_API_BUILD_TESTS OFF)
86109
add_subdirectory(third_party/openai-api.cpp)
@@ -92,6 +115,10 @@ set(COMMON_SOURCES
92115
src/runner/utils/memory_utils.cpp
93116
src/runner/utils/cqdm.cpp
94117
src/runner/LLM.cpp
118+
src/runner/utils/files.cpp
119+
src/runner/utils/image_processor.cpp
120+
src/runner/utils/mrope.cpp
121+
src/runner/vision/vision_module.cpp
95122
)
96123

97124
# ---------------------------------------------------------------------------
@@ -115,11 +142,13 @@ if(BUILD_AX650)
115142
-L${BSP_MSP_DIR}/lib
116143
ax_engine ax_interpreter ax_sys
117144
pthread tokenizer openai_api::server
145+
${AXLLM_VISION_LIBS}
118146
)
119147
target_link_libraries(llm_smoke
120148
-L${BSP_MSP_DIR}/lib
121149
ax_engine ax_interpreter ax_sys
122150
pthread tokenizer openai_api::server
151+
${AXLLM_VISION_LIBS}
123152
)
124153
install(TARGETS axllm DESTINATION bin)
125154
install(TARGETS llm_smoke DESTINATION bin)
@@ -155,6 +184,7 @@ endif()
155184
target_link_libraries(axllm
156185
-L${AXCL_DIR}/lib
157186
${AXCL_LINK_LIBS}
187+
${AXLLM_VISION_LIBS}
158188
)
159189
# AXCL smoke test
160190
add_executable(llm_smoke tools/llm_smoke.cpp
@@ -167,6 +197,7 @@ endif()
167197
target_link_libraries(llm_smoke
168198
-L${AXCL_DIR}/lib
169199
${AXCL_LINK_LIBS}
200+
${AXLLM_VISION_LIBS}
170201
)
171202
set_target_properties(axllm PROPERTIES LINK_FLAGS "-Wl,--as-needed -Wl,--unresolved-symbols=ignore-in-shared-libs")
172203
install(TARGETS axllm DESTINATION bin)

README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,20 @@
1616

1717
### 已支持模型
1818

19+
#### LLM
1920
- Qwen2.5
2021
- Qwen3
2122
- MiniCPM
2223
- SmolLM2
2324
- Llama3
25+
- HY-MT1.5-1.8B
26+
- ...
27+
28+
#### VLM(多模态)
29+
- Qwen3-VL-2B-Instruct
30+
- SmolVLM2-500M-Video-Instruct
31+
- FastVLM-1.5B-GPTQ-Int4
32+
- InternVL3_5-1B-GPTQ-INT4
2433
- ...
2534

2635
### 获取地址
@@ -93,6 +102,25 @@ axllm
93102

94103
如需 API/Gradio 示例,可继续使用 `scripts/` 下的脚本(与分支功能一致)。
95104

105+
### VLM 使用说明
106+
107+
- 使用 VLM 模型目录运行 `axllm run <vlm_model_path>`
108+
- 每轮输入 `prompt` 后,会提示 `image >>`
109+
- 直接回车:本轮仅文本对话
110+
- 输入图片路径:图文对话
111+
- 输入 `video:<frames_dir>`:视频/多帧对话(按文件名排序读取帧)
112+
113+
VLM 模型的 `config.json` 需包含(或等价字段):
114+
115+
- `vlm_type`
116+
- `filename_image_encoder_axmodel`
117+
- `vision_patch_size`
118+
119+
可选字段(建议保留,未配置时会自动从视觉编码模型输入形状推断):
120+
121+
- `vision_width`
122+
- `vision_height`
123+
96124
## 运行示例
97125
### 命令行对话
98126
```shell

docs/vision_encoder_patterns.md

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Vision/VLM Branch Patterns (Notes)
2+
3+
This repo's `axllm` branch currently runs **text-only** models by:
4+
5+
- `tokenizer->encode(history)` to token ids
6+
- compute `tokens_diff` vs `last_tokens_ids` (append-only fast path)
7+
- `SetKVCache(k/v, precompute_len, input_num_token)`
8+
- build `out_embed` for `tokens_diff` via `embed_selector.getByIndex(id)`
9+
- `Run(out_embed)` to do prefill/decode
10+
- `GetKVCache(...)`, append assistant reply, update `last_tokens_ids`
11+
12+
So the "context" (KV cache) support on `axllm` is based on **token-id diff** + caching.
13+
14+
Below are distilled patterns from the VLM branches (Qwen/InternVL/FastVLM/SmolVLM2, incl. multi-frame/video variants).
15+
16+
## 1) How Images/Videos Get Into The Prompt
17+
18+
All VLM branches follow the same high-level idea:
19+
20+
1. The tokenizer's chat template emits **placeholder tokens** for each media item.
21+
2. The vision encoder produces **per-placeholder embedding vectors**.
22+
3. The code finds the placeholder positions in `input_ids` and **replaces** the corresponding token embeddings in `out_embed`.
23+
24+
The differences are in (a) which placeholder tokens are used and (b) how to locate them.
25+
26+
### A. Qwen3/Qwen2.5-VL style
27+
28+
- Prompt uses `<|vision_start|> ... <|vision_end|>` wrapping.
29+
- Inside the vision block, placeholders are repeated:
30+
- image: `<|image_pad|>` repeated `num_media * num_media_tokens`
31+
- video: `<|video_pad|>` repeated `num_media * num_media_tokens`
32+
- Injection offsets are typically found by scanning for `vision_start_token_id` and taking `offset=i+1` (first placeholder after start).
33+
34+
### B. InternVL style
35+
36+
- Prompt uses `<img> ... </img>` wrapping.
37+
- Placeholder token is usually `<IMG_CONTEXT>` repeated `num_media_tokens`.
38+
- Some templates place `content.data` before placeholders; others after.
39+
- Injection offsets are found by scanning `input_ids` for `IMAGE_CONTEXT_TOKEN` (or via `<|vision_start|>` then next token).
40+
41+
### C. FastVLM style
42+
43+
- Similar to InternVL: uses a simple placeholder token (e.g. `"<image>"`) repeated.
44+
- Code scans for a fixed `IMAGE_CONTEXT_TOKEN` id in `input_ids` and overwrites those slots.
45+
46+
### D. SmolVLM2 style
47+
48+
- Placeholder token is `"<image>"` (often a single token id).
49+
- The template may include additional "header tokens" around the image tokens.
50+
- Injection offsets are found by detecting **runs** of consecutive `IMAGE_CONTEXT_TOKEN` ids, pushing the first id of each run.
51+
52+
## 2) Vision Encoder Output Shapes And Preprocess
53+
54+
There are two broad encoder IO styles:
55+
56+
### A. "Classic image encoder" (single image -> embedding sequence)
57+
58+
Common in `ax-fastvlm`, `ax-internvl`:
59+
60+
- Determine input layout:
61+
- NCHW float input: normalize `(x/255 - mean) / std`, write as `float` into input tensor.
62+
- NHWC u8 input: resize + RGB, memcpy into input tensor.
63+
- Determine output dtype:
64+
- if output size matches `elem_count * 2` => bf16 output
65+
- if output size matches `elem_count * 4` => fp32 output (then convert to bf16)
66+
- Result is a flat bf16 array whose length is `(num_media_tokens * tokens_embed_size)` (conceptually).
67+
68+
### B. Qwen-VL "video processor" (frames -> patches -> embedding sequence)
69+
70+
Common in `ax-qwen2_5-vl`, `ax-qwen3-vl`, `axcl-qwen3-vl`:
71+
72+
- Preprocess frames (even for image) via a "video-like" patching pipeline:
73+
- resize to `(vision_config.height, vision_config.width)`
74+
- RGB
75+
- temporal patching (`temporal_patch_size`)
76+
- spatial merge (`spatial_merge_size`)
77+
- patch size (`patch_size`)
78+
- Produces `pixel_values` per grid segment (for videos: multiple segments).
79+
- Each segment is fed to `image_encoder` to produce an embedding block.
80+
- Tracks `cfg.image_grid_thw` and/or `cfg.video_grid_thw` for mRoPE.
81+
82+
## 3) Extra Side-Inputs Used By Some VLMs
83+
84+
### A. mRoPE / position ids (Qwen-VL)
85+
86+
Qwen-VL branches compute `position_ids` (3 x seq_len) based on:
87+
88+
- `input_ids`
89+
- `cfg.image_grid_thw` / `cfg.video_grid_thw`
90+
- vision settings: `spatial_merge_size`, sometimes video time scaling (`second_per_grid_ts`)
91+
92+
These `position_ids` are then used in prefill by writing them into the model's `indices` input
93+
instead of a simple monotonically increasing index.
94+
95+
### B. deepstack features (some AXCL Qwen-VL variants)
96+
97+
Some image encoders output additional tensors (e.g. 3 "deepstack features").
98+
During prefill, for tokens where `visual_pos_mask[j] == 1`, the code adds those features
99+
into the intermediate embedding stream (bf16->fp32 add -> bf16).
100+
101+
### C. visual_pos_mask
102+
103+
Computed from `input_ids` by marking positions that equal `image_token_id` or `video_token_id`.
104+
Used to align deepstack features with only the visual placeholder positions.
105+
106+
## 4) How Multi-Image / Multi-Frame Is Represented
107+
108+
All branches encode multi-image as:
109+
110+
- `Content{ role=USER, type=IMAGE, data=prompt, num_media=N, num_media_tokens=T }`
111+
- Tokenizer repeats placeholder tokens `N*T`.
112+
- Vision encoder returns `N` blocks, each block length `T * tokens_embed_size`.
113+
114+
For video:
115+
116+
- Some branches treat each temporal grid segment as one "media block".
117+
- Some compute `cfg.video_grid_thw = {{grid_t, grid_h, grid_w}}` and then internally expand.
118+
119+
## 5) Key Gap vs `axllm` (Context Support)
120+
121+
The VLM branches above usually build `input_ids` for the full prompt and then build the full
122+
`out_embed` (text token embeddings + injected vision embeddings) in one go.
123+
124+
`axllm` is different:
125+
126+
- It only materializes embeddings for `tokens_diff` (incremental tail) to support KV-cache context.
127+
- It currently has **no hook** to replace placeholder token embeddings with vision embeddings.
128+
129+
So a pluggable image encoder for `axllm` must solve:
130+
131+
- When `tokens_diff` contains placeholder ids, produce embeddings that include:
132+
- normal token embeddings for text tokens
133+
- vision embeddings for the placeholder slots
134+
- While keeping the existing "token-id diff + KV cache" logic intact.
135+
136+
Practical implication:
137+
138+
- The "media -> placeholder slots" mapping must be reproducible from `(history, token_ids)` and/or
139+
persisted state, so that incremental encoding can inject the correct vision embeddings for only the
newly appended part of the conversation (the `tokens_diff` tail), rather than re-encoding the full history.
141+
142+
## 6) Abstraction Axes For A Pluggable Vision Module (Proposed)
143+
144+
To unify the branches without changing `axllm`'s main control flow, the pluggable module needs
145+
to cover these responsibilities:
146+
147+
- `Tokenizer side`:
148+
- define placeholder tokens + how many tokens per media item (`num_media_tokens`)
149+
- define how to locate placeholder offsets in `input_ids`
150+
- (optional) provide `position_ids` generation rules (mRoPE)
151+
- `Vision side`:
152+
- load/init image encoder axmodel(s)
153+
- preprocess image/video frames
154+
- produce per-media embedding blocks (bf16, `tokens_embed_size` aligned)
155+
- (optional) deepstack features
156+
- `Injection side`:
157+
- given `input_ids` and media blocks, produce an "embedding stream" where placeholder slots
158+
are overwritten by vision embeddings
159+
- for `axllm` context mode: support doing the above for **only the tail** (`tokens_diff`),
160+
while keeping placeholder alignment correct.
161+
162+
These notes are intentionally implementation-agnostic so we can use them as a checklist when
163+
refactoring `axllm` to support LLM + VLM with context.
164+
165+
## 7) Current `axllm` Implementation Notes (This Branch)
166+
167+
This branch adds a pluggable vision module behind a runtime config switch (no compile-time toggle):
168+
169+
- `config.json`: `vlm_type` (or `VLM_TYPE`) selects the vision module.
170+
- If `vlm_type` is set to anything other than `"None"` (or its numeric equivalent `0`), then `filename_image_encoder_axmodel` must also be set.
171+
- Vision preprocessing backend is selected at **CMake configure time**:
172+
- Prefer OpenCV if found.
173+
- Otherwise fall back to `third_party/SimpleCV` and print a CMake warning (slight differences vs OpenCV are possible).
174+
175+
VLM runtime data flow (keeps the existing token-diff + KV-cache logic):
176+
177+
- Tokenizer still generates placeholder tokens based on `Content.num_media` and `Content.num_media_tokens`.
178+
- Vision module prepares a copy of `history` where those two fields are filled, then:
179+
- encodes images/videos with `image_encoder.axmodel`
180+
- builds a `pos2vision` mapping for placeholder positions in `input_ids`
181+
- (Qwen-VL) computes `position_ids` (mRoPE) and a decode start override
182+
- The LLM loop only builds embeddings for `tokens_diff`, and replaces only the vision placeholder slots in that tail.

0 commit comments

Comments
 (0)