
Commit 1223bd9 (parent 76d6247)

feat(inference): vllm support

Signed-off-by: axel7083 <[email protected]>

9 files changed, +198 -7 lines changed

packages/backend/src/assets/ai.json

Lines changed: 7 additions & 0 deletions
@@ -490,6 +490,13 @@
       },
       "memory": 4372811936,
       "backend": "llama-cpp"
+    },
+    {
+      "id": "Qwen/Qwen2-VL-2B-Instruct",
+      "name": "Qwen/Qwen2-VL-2B-Instruct",
+      "description": "Qwen/Qwen2-VL-2B-Instruct",
+      "url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
+      "backend": "vllm"
     }
   ],
   "categories": [

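For orientation, the new entry follows the same shape as the existing catalog entries. A rough TypeScript sketch of that shape, with the field list inferred from this diff only (the project's real ModelInfo type has more fields):

// Hypothetical sketch of an ai.json model entry; fields taken from this diff.
interface CatalogModelEntry {
  id: string;          // e.g. "Qwen/Qwen2-VL-2B-Instruct"
  name: string;
  description: string;
  url?: string;        // e.g. "huggingface:/Qwen/Qwen2-VL-2B-Instruct"
  memory?: number;     // bytes, present on the llama-cpp entries
  backend?: string;    // "llama-cpp" | "whisper-cpp" | "vllm"
}
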
packages/backend/src/assets/inference-images.json

Lines changed: 3 additions & 0 deletions
@@ -5,5 +5,8 @@
   "llamacpp": {
     "default": "quay.io/ramalama/ramalama-llama-server@sha256:cbadb36fbbc2abf9867a33e6dfe3f2df4a76774259b5d4d24d50f4fc7e525406",
     "cuda": "quay.io/ramalama/cuda-llama-server@sha256:56efc824e5b3ae6a6a11e9537ed9e2ac05f9f9fc6f2e81a55eb67b662c94fe95"
+  },
+  "vllm": {
+    "default": "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4"
   }
 }
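
The new "vllm" entry is consumed by the VLLM provider added later in this commit via config.image ?? images.vllm.default. A minimal sketch of that lookup (the helper name is illustrative, not part of the commit):

// Hypothetical helper: prefer a caller-supplied image, otherwise fall back to
// the default registered in inference-images.json (import path as used by the VLLM provider).
import * as images from '../../assets/inference-images.json';

function resolveVllmImage(override?: string): string {
  return override ?? images.vllm.default;
}

// resolveVllmImage() === 'public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.4'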

packages/backend/src/managers/modelsManager.ts

Lines changed: 3 additions & 0 deletions
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
     model: ModelInfo,
     labels?: { [key: string]: string },
   ): Promise<string> {
+    console.log('[ModelsManager] upload model', model);
+
     // ensure the model upload is not disabled
     if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
       console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@
 
     // perform download
     const path = uploader.perform(model.id);
+    console.log('[ModelsManager] path got', path);
     await this.updateModelInfos();
 
     return path;

packages/backend/src/managers/playgroundV2Manager.ts

Lines changed: 12 additions & 4 deletions
@@ -38,6 +38,7 @@ import type { TaskRegistry } from '../registries/TaskRegistry';
 import type { CancellationTokenRegistry } from '../registries/CancellationTokenRegistry';
 import { getHash } from '../utils/sha';
 import type { RpcExtension } from '@shared/messages/MessageProxy';
+import { InferenceType } from '@shared/models/IInference';
 
 export class PlaygroundV2Manager implements Disposable {
   #conversationRegistry: ConversationRegistry;
@@ -123,8 +124,10 @@
 
     // create/start inference server if necessary
     const servers = this.inferenceManager.getServers();
+    console.log('servers', servers);
     const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
     if (!server) {
+      console.warn(`no server running found with modelId ${model.id}, creating new one`);
       await this.inferenceManager.createInferenceServer(
         await withDefaultConfiguration({
           modelsInfo: [model],
@@ -239,13 +242,18 @@
       abortController.abort('cancel');
     });
 
+    const messages = this.getFormattedMessages(conversation.id);
+    console.log('[PlaygroundV2Manager] messages', messages);
+    console.log('[PlaygroundV2Manager] messages', options);
+
     client.chat.completions
       .create(
         {
-          messages: this.getFormattedMessages(conversation.id),
+          messages: messages,
           stream: true,
           model: modelInfo.file.file,
-          ...options,
+          // vllm is not compatible with options provided, only llamacpp is
+          ...(server.type === InferenceType.LLAMA_CPP ? options : {}),
         },
         {
           signal: abortController.signal,
@@ -333,8 +341,8 @@
       .map(
         message =>
           ({
-            name: undefined,
-            ...message,
+            role: message.role,
+            content: message.content,
           }) as ChatCompletionMessageParam,
       );
   }
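
The behavioral change here is that extra model options are only forwarded when the backing server is llama-cpp, since the vLLM OpenAI-compatible server does not accept them. A standalone sketch of that gating (helper name and option type are illustrative, not part of the commit):

import { InferenceType } from '@shared/models/IInference';

// Illustrative helper mirroring the request built above: sampling options are
// spread into the payload only for llama-cpp servers.
function buildCompletionRequest(
  serverType: InferenceType,
  modelFile: string,
  messages: Array<{ role: string; content: string }>,
  options: Record<string, unknown>,
): Record<string, unknown> {
  return {
    messages,
    stream: true,
    model: modelFile,
    ...(serverType === InferenceType.LLAMA_CPP ? options : {}),
  };
}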

packages/backend/src/models/HuggingFaceModelHandler.ts

Lines changed: 3 additions & 2 deletions
@@ -23,6 +23,7 @@ import type { CompletionEvent } from './baseEvent';
 import { getDurationSecondsSince } from '../utils/utils';
 import type { ModelsManager } from '../managers/modelsManager';
 import fs from 'node:fs/promises';
+import { dirname, basename } from 'node:path';
 
 function parseURL(url: string): { repo: string; revision?: string } | undefined {
   const u = URL.parse(url);
@@ -128,8 +129,8 @@
     const model = hfModels.find(m => m.repo?.repo === repo.id.name && m.repo?.revision === ref);
     if (model) {
       model.model.file = {
-        path: revision.path,
-        file: revision.path,
+        path: dirname(revision.path),
+        file: basename(revision.path),
         creation: revision.lastModifiedAt,
         size: revision.size,
       };
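
With this change, file.path holds the snapshots directory and file.file holds the snapshot commit hash, which is the layout the VLLM provider below expects. A small illustration (the cache path is made up):

import { dirname, basename } from 'node:path';

// Illustrative Hugging Face cache path; the real value comes from the cache scan.
const revisionPath =
  '/home/user/.cache/huggingface/hub/models--Qwen--Qwen2-VL-2B-Instruct/snapshots/0a1b2c3d4e5f';

const file = {
  path: dirname(revisionPath),  // .../models--Qwen--Qwen2-VL-2B-Instruct/snapshots
  file: basename(revisionPath), // '0a1b2c3d4e5f' (the snapshot commit hash)
};

console.log(file);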

packages/backend/src/studio.ts

Lines changed: 4 additions & 0 deletions
@@ -62,6 +62,7 @@ import { HuggingFaceModelHandler } from './models/HuggingFaceModelHandler';
 import { LlamaStackApiImpl } from './llama-stack-api-impl';
 import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackAPI';
 import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
+import { VLLM } from './workers/provider/VLLM';
 
 export class Studio {
   readonly #extensionContext: ExtensionContext;
@@ -280,6 +281,9 @@
     this.#extensionContext.subscriptions.push(
       this.#inferenceProviderRegistry.register(new WhisperCpp(this.#taskRegistry, this.#podmanConnection)),
     );
+    this.#extensionContext.subscriptions.push(
+      this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
+    );
 
     /**
      * The inference manager create, stop, manage Inference servers

packages/backend/src/workers/provider/VLLM.ts

Lines changed: 162 additions & 0 deletions (new file)
@@ -0,0 +1,162 @@
/**********************************************************************
 * Copyright (C) 2024 Red Hat, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ***********************************************************************/

import { InferenceProvider } from './InferenceProvider';
import type { TaskRegistry } from '../../registries/TaskRegistry';
import type { PodmanConnection } from '../../managers/podmanConnection';
import { type InferenceServer, InferenceType } from '@shared/models/IInference';
import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
import * as images from '../../assets/inference-images.json';
import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
import { basename, dirname } from 'node:path';
import { join as joinposix } from 'node:path/posix';
import { getLocalModelFile } from '../../utils/modelsUtils';
import { SECOND } from './LlamaCppPython';

export class VLLM extends InferenceProvider {
  constructor(
    taskRegistry: TaskRegistry,
    private podmanConnection: PodmanConnection,
  ) {
    super(taskRegistry, InferenceType.VLLM, 'vllm');
  }

  dispose(): void {}

  public enabled = (): boolean => true;

  /**
   * Here is an example
   *
   * podman run -it --rm
   * -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
   * -e HF_HUB_CACHE=/cache
   * localhost/vllm-cpu-env:latest
   * --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
   *
   * @param config
   */
  override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
    if (config.modelsInfo.length !== 1)
      throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);

    const modelInfo = config.modelsInfo[0];
    if (modelInfo.backend !== InferenceType.VLLM) {
      throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM} got ${modelInfo.backend}.`);
    }

    if (modelInfo.file === undefined) {
      throw new Error('The model info file provided is undefined');
    }

    console.log('[VLLM]', config);
    console.log('[VLLM] modelInfo.file', modelInfo.file.path);

    // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
    // modelInfo.file.path

    const fullPath = getLocalModelFile(modelInfo);

    // modelInfo.file.path must be under the form $(HF_HUB_CACHE)/<repo-type>--<repo-id>/snapshots/<commit-hash>
    const parent = dirname(fullPath);
    const commitHash = basename(fullPath);
    const name = basename(parent);
    if (name !== 'snapshots') throw new Error('you must provide snapshot path for vllm');
    const modelCache = dirname(parent);

    let connection: ContainerProviderConnection | undefined;
    if (config.connection) {
      connection = this.podmanConnection.getContainerProviderConnection(config.connection);
    } else {
      connection = this.podmanConnection.findRunningContainerProviderConnection();
    }

    if (!connection) throw new Error('no running connection could be found');

    const labels: Record<string, string> = {
      ...config.labels,
      [LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
    };

    const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
    // https://huggingface.co/docs/transformers/main/en/installation#offline-mode
    // HF_HUB_OFFLINE in main
    // TRANSFORMERS_OFFLINE for legacy
    const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];

    labels['api'] = `http://localhost:${config.port}/inference`;

    const mounts: MountConfig = [
      {
        Target: `/cache/${modelInfo.id}`,
        Source: modelCache,
        Type: 'bind',
      },
    ];

    const containerInfo = await this.createContainer(
      imageInfo.engineId,
      {
        Image: imageInfo.Id,
        Detach: true,
        Labels: labels,
        HostConfig: {
          AutoRemove: false,
          Mounts: mounts,
          PortBindings: {
            '8000/tcp': [
              {
                HostPort: `${config.port}`,
              },
            ],
          },
          SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
        },
        HealthCheck: {
          // must be the port INSIDE the container not the exposed one
          Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
          Interval: SECOND * 5,
          Retries: 4 * 5,
        },
        Env: envs,
        Cmd: [
          `--model=${joinposix('/cache', modelInfo.id, 'snapshots', commitHash)}`,
          `--served_model_name=${modelInfo.file.file}`,
          '--chat-template-content-format=openai',
        ],
      },
      labels,
    );

    return {
      models: [modelInfo],
      status: 'running',
      connection: {
        port: config.port,
      },
      container: {
        containerId: containerInfo.id,
        engineId: containerInfo.engineId,
      },
      type: InferenceType.VLLM,
      labels: labels,
    };
  }
}
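
Once perform resolves, the container publishes vLLM's HTTP API on config.port. A rough way to exercise it from the host, assuming vLLM's standard /version and OpenAI-compatible /v1/chat/completions endpoints (the port and model name below are placeholders; the served model name is modelInfo.file.file, i.e. the snapshot commit hash):

// Hypothetical smoke test against a server started by this provider (Node 18+ global fetch).
async function smokeTest(port: number, servedModelName: string): Promise<void> {
  // Same endpoint the container health check curls (localhost:8000/version inside the container).
  const version = await fetch(`http://localhost:${port}/version`).then(r => r.json());
  console.log('vllm version:', version);

  // OpenAI-compatible chat completion against the served model.
  const response = await fetch(`http://localhost:${port}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: servedModelName,
      messages: [{ role: 'user', content: 'Hello!' }],
    }),
  });
  const completion = await response.json();
  console.log(completion.choices?.[0]?.message?.content);
}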

packages/frontend/src/pages/PlaygroundCreate.svelte

Lines changed: 3 additions & 1 deletion
@@ -16,7 +16,9 @@ import ModelSelect from '/@/lib/select/ModelSelect.svelte';
 import { InferenceType } from '@shared/models/IInference';
 
 let localModels: ModelInfo[];
-$: localModels = $modelsInfo.filter(model => model.file && model.backend === InferenceType.LLAMA_CPP);
+$: localModels = $modelsInfo.filter(
+  model => model.file && (model.backend === InferenceType.LLAMA_CPP || model.backend === InferenceType.VLLM),
+);
 $: availModels = $modelsInfo.filter(model => !model.file);
 let model: ModelInfo | undefined = undefined;
 let submitted: boolean = false;

packages/shared/src/models/IInference.ts

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ import type { ModelInfo } from './IModelInfo';
 export enum InferenceType {
   LLAMA_CPP = 'llama-cpp',
   WHISPER_CPP = 'whisper-cpp',
+  VLLM = 'vllm',
   NONE = 'none',
 }
