
Commit b7e21db

axel7083 authored and jeffmaury committed
feat(inference): vllm support
Signed-off-by: axel7083 <[email protected]>
1 parent 42f25b0 commit b7e21db

8 files changed: +184 −0 lines changed

packages/backend/src/assets/ai.json

Lines changed: 7 additions & 0 deletions
@@ -526,6 +526,13 @@
       "license": "Apache-2.0",
       "url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
       "backend": "openvino"
+    },
+    {
+      "id": "Qwen/Qwen2-VL-2B-Instruct",
+      "name": "Qwen/Qwen2-VL-2B-Instruct",
+      "description": "Qwen/Qwen2-VL-2B-Instruct",
+      "url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
+      "backend": "vllm"
     }
   ],
   "categories": [

packages/backend/src/assets/inference-images.json

Lines changed: 3 additions & 0 deletions
@@ -8,5 +8,8 @@
   },
   "openvino": {
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
+  },
+  "vllm": {
+    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
   }
 }
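
The new "vllm" entry gives the provider a default CPU image to fall back on; an explicit image in the inference server configuration still wins, following the same config.image ?? images.vllm.default pattern used in the new provider below. A minimal sketch of that resolution, with the relative import path taken from the provider's own location and otherwise illustrative:

import * as images from '../../assets/inference-images.json';

// Pick the image to pull: an override from the server configuration wins,
// otherwise fall back to the per-backend default declared in inference-images.json.
function resolveVllmImage(override?: string): string {
  return override ?? images.vllm.default;
}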

packages/backend/src/managers/modelsManager.ts

Lines changed: 3 additions & 0 deletions
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
     model: ModelInfo,
     labels?: { [key: string]: string },
   ): Promise<string> {
+    console.log('[ModelsManager] upload model', model);
+
     // ensure the model upload is not disabled
     if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
       console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@
 
     // perform download
     const path = uploader.perform(model.id);
+    console.log('[ModelsManager] path got', path);
     await this.updateModelInfos();
 
     return path;

packages/backend/src/managers/playgroundV2Manager.ts

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
 import type { ToolSet } from 'ai';
 import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
 import { toMcpClients } from '../utils/mcpUtils';
+import { InferenceType } from '@shared/models/IInference';
 
 export class PlaygroundV2Manager implements Disposable {
   readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@
 
     // create/start inference server if necessary
     const servers = this.inferenceManager.getServers();
+    console.log('servers', servers);
     const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
     if (!server) {
+      console.warn(`no server running found with modelId ${model.id}, creating new one`);
       await this.inferenceManager.createInferenceServer(
         await withDefaultConfiguration({
           modelsInfo: [model],
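
The lookup in this hunk reuses an existing inference server only when one of its models matches the requested model id; otherwise a new server is created with the default configuration. A reduced sketch of that predicate on its own, with ServerLike trimmed down for illustration (the real InferenceServer carries more fields):

// Trimmed-down server shape for illustration only.
interface ServerLike {
  models: { id: string }[];
}

// Same predicate as in the diff: a server qualifies when it already serves the model.
function findServerForModel(servers: ServerLike[], modelId: string): ServerLike | undefined {
  return servers.find(s => s.models.map(mi => mi.id).includes(modelId));
}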

packages/backend/src/models/HuggingFaceModelHandler.ts

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
+import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);

packages/backend/src/studio.ts

Lines changed: 4 additions & 0 deletions
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackA
 import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
 import { OpenVINO } from './workers/provider/OpenVINO';
 import os from 'node:os';
+import { VLLM } from './workers/provider/VLLM';
 
 export class Studio {
   readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@
         ),
       );
     }
+    this.#extensionContext.subscriptions.push(
+      this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
+    );
 
     /**
      * The inference manager create, stop, manage Inference servers

packages/backend/src/workers/provider/VLLM.ts

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
+/**********************************************************************
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ***********************************************************************/
+
+import { InferenceProvider } from './InferenceProvider';
+import type { TaskRegistry } from '../../registries/TaskRegistry';
+import type { PodmanConnection } from '../../managers/podmanConnection';
+import { type InferenceServer, InferenceType } from '@shared/models/IInference';
+import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
+import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
+import * as images from '../../assets/inference-images.json';
+import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
+import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
+import { basename, dirname } from 'node:path';
+import { join as joinposix } from 'node:path/posix';
+import { getLocalModelFile } from '../../utils/modelsUtils';
+import { SECOND } from './LlamaCppPython';
+
+export class VLLM extends InferenceProvider {
+  constructor(
+    taskRegistry: TaskRegistry,
+    private podmanConnection: PodmanConnection,
+  ) {
+    super(taskRegistry, InferenceType.VLLM, 'vllm');
+  }
+
+  dispose(): void {}
+
+  public enabled = (): boolean => true;
+
+  /**
+   * Here is an example
+   *
+   * podman run -it --rm
+   * -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
+   * -e HF_HUB_CACHE=/cache
+   * localhost/vllm-cpu-env:latest
+   * --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
+   *
+   * @param config
+   */
+  override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
+    if (config.modelsInfo.length !== 1)
+      throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);
+
+    const modelInfo = config.modelsInfo[0];
+    if (modelInfo.backend !== InferenceType.VLLM) {
+      throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM} got ${modelInfo.backend}.`);
+    }
+
+    if (modelInfo.file === undefined) {
+      throw new Error('The model info file provided is undefined');
+    }
+
+    console.log('[VLLM]', config);
+    console.log('[VLLM] modelInfo.file', modelInfo.file.path);
+
+    // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
+    // modelInfo.file.path
+
+    const fullPath = getLocalModelFile(modelInfo);
+
+    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo-type>--<repo-id>/snapshots/<commit-hash>
+    const parent = dirname(fullPath);
+    const commitHash = basename(fullPath);
+    const name = basename(parent);
+    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
+    const modelCache = dirname(parent);
+
+    let connection: ContainerProviderConnection | undefined;
+    if (config.connection) {
+      connection = this.podmanConnection.getContainerProviderConnection(config.connection);
+    } else {
+      connection = this.podmanConnection.findRunningContainerProviderConnection();
+    }
+
+    if (!connection) throw new Error('no running connection could be found');
+
+    const labels: Record<string, string> = {
+      ...config.labels,
+      [LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
+    };
+
+    const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
+    // https://huggingface.co/docs/transformers/main/en/installation#offline-mode
+    // HF_HUB_OFFLINE in main
+    // TRANSFORMERS_OFFLINE for legacy
+    const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
+
+    labels['api'] = `http://localhost:${config.port}/inference`;
+
+    const mounts: MountConfig = [
+      {
+        Target: `/cache/${modelInfo.id}`,
+        Source: modelCache,
+        Type: 'bind',
+      },
+    ];
+
+    const containerInfo = await this.createContainer(
+      imageInfo.engineId,
+      {
+        Image: imageInfo.Id,
+        Detach: true,
+        Labels: labels,
+        HostConfig: {
+          AutoRemove: false,
+          Mounts: mounts,
+          PortBindings: {
+            '8000/tcp': [
+              {
+                HostPort: `${config.port}`,
+              },
+            ],
+          },
+          SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
+        },
+        HealthCheck: {
+          // must be the port INSIDE the container not the exposed one
+          Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
+          Interval: SECOND * 5,
+          Retries: 4 * 5,
+        },
+        Env: envs,
+        Cmd: [
+          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
+          `--served_model_name=${modelInfo.file.file}`,
+          '--chat-template-content-format=openai',
+        ],
+      },
+      labels,
+    );
+
+    return {
+      models: [modelInfo],
+      status: 'running',
+      connection: {
+        port: config.port,
+      },
+      container: {
+        containerId: containerInfo.id,
+        engineId: containerInfo.engineId,
+      },
+      type: InferenceType.VLLM,
+      labels: labels,
+    };
+  }
+}
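
Most of perform() is path bookkeeping: a model downloaded through the Hugging Face handler lives at $(HF_HUB_CACHE)/models--<org>--<repo>/snapshots/<commit-hash> on the host, that model's cache directory is bind-mounted at /cache/<model id> in the container, and vLLM is started against the snapshot through the container-side path. A standalone sketch of that decomposition, using the same node:path helpers as the provider (the function name is illustrative):

import { basename, dirname } from 'node:path';
import { join as joinposix } from 'node:path/posix';

// Given the host-side snapshot directory of a downloaded model, compute the
// path vLLM should be pointed at inside the container. Throws when the path
// does not follow the HF hub cache layout (<model cache>/snapshots/<commit-hash>).
function containerModelPath(hostSnapshotPath: string, modelId: string): string {
  const commitHash = basename(hostSnapshotPath); // <commit-hash>
  const snapshots = dirname(hostSnapshotPath); // .../snapshots
  if (basename(snapshots) !== 'snapshots') throw new Error('expected a snapshots/<commit-hash> path');
  // The bind mount exposes dirname(snapshots) as /cache/<model id>, so the
  // container-side path is always built with POSIX separators.
  return joinposix('/cache', modelId, 'snapshots', commitHash);
}

// e.g. containerModelPath('/home/user/.cache/huggingface/hub/models--facebook--opt-125m/snapshots/abc123', 'facebook/opt-125m')
//      => '/cache/facebook/opt-125m/snapshots/abc123'

Note also that the health check probes port 8000 inside the container (vLLM's default), while PortBindings maps it to config.port on the host.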

packages/shared/src/models/IInference.ts

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ export enum InferenceType {
   LLAMA_CPP = 'llama-cpp',
   WHISPER_CPP = 'whisper-cpp',
   OPENVINO = 'openvino',
+  VLLM = 'vllm',
   NONE = 'none',
 }