From d536384c9fa31b0863f06ef053c7085d69464fef Mon Sep 17 00:00:00 2001
From: David Irvine
Date: Sun, 15 Mar 2026 17:51:40 +0000
Subject: [PATCH] Fix Qwen35 VLM crash on text-only inference (SmallVector out of range)

Qwen35Language.LanguageModel.callAsFunction assumes inputs is always 2D
[batch, seq], but text-only callers like WiredMemoryUtils.tune and
TokenIterator can pass 1D [seq] token arrays. This causes getRopeIndex()
and subsequent dim(1) calls to crash with "SmallVector out of range" when
accessing a non-existent dimension.

Add an ndim check at the top of callAsFunction to expand 1D inputs to 2D
before any dimension-dependent logic runs.

Fixes #148
---
 Libraries/MLXVLM/Models/Qwen35.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Libraries/MLXVLM/Models/Qwen35.swift b/Libraries/MLXVLM/Models/Qwen35.swift
index 8a3c3984..d0909da4 100644
--- a/Libraries/MLXVLM/Models/Qwen35.swift
+++ b/Libraries/MLXVLM/Models/Qwen35.swift
@@ -921,6 +921,10 @@ enum Qwen35Language {
             imageGridTHW: [THW]? = nil,
             videoGridTHW: [THW]? = nil
         ) -> LMOutput {
+            // Ensure inputs is 2D [batch, seq]. Text-only callers (e.g.
+            // WiredMemoryUtils, TokenIterator) may pass 1D token arrays.
+            let inputs = inputs.ndim == 1 ? inputs.expandedDimensions(axis: 0) : inputs
+
             if pixelValues != nil {
                 precomputedPositionIds = nil
                 ropeDeltas = nil