diff --git a/Libraries/MLXVLM/Models/Qwen35.swift b/Libraries/MLXVLM/Models/Qwen35.swift index 8a3c3984..d0909da4 100644 --- a/Libraries/MLXVLM/Models/Qwen35.swift +++ b/Libraries/MLXVLM/Models/Qwen35.swift @@ -921,6 +921,10 @@ enum Qwen35Language { imageGridTHW: [THW]? = nil, videoGridTHW: [THW]? = nil ) -> LMOutput { + // Ensure inputs is 2D [batch, seq]. Text-only callers (e.g. + // WiredMemoryUtils, TokenIterator) may pass 1D token arrays. + let inputs = inputs.ndim == 1 ? inputs.expandedDimensions(axis: 0) : inputs + if pixelValues != nil { precomputedPositionIds = nil ropeDeltas = nil