7 changes: 7 additions & 0 deletions packages/backend/src/assets/ai.json
@@ -526,6 +526,13 @@
"license": "Apache-2.0",
"url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
"backend": "openvino"
},
{
"id": "Qwen/Qwen2-VL-2B-Instruct",
"name": "Qwen/Qwen2-VL-2B-Instruct",
"description": "Qwen/Qwen2-VL-2B-Instruct",
"url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
"backend": "vllm"
}
],
"categories": [
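Note: the new catalog entry above ties Qwen/Qwen2-VL-2B-Instruct to the vLLM backend through its backend field, the same field VLLM.perform() validates further down in this diff. A minimal standalone sketch of that routing, with an assumed simplified catalog shape:

// Minimal standalone sketch (assumed simplified shape of a catalog entry).
interface CatalogModel {
  id: string;
  name: string;
  description: string;
  url: string;
  backend?: string;
}

const entry: CatalogModel = {
  id: 'Qwen/Qwen2-VL-2B-Instruct',
  name: 'Qwen/Qwen2-VL-2B-Instruct',
  description: 'Qwen/Qwen2-VL-2B-Instruct',
  url: 'huggingface:/Qwen/Qwen2-VL-2B-Instruct',
  backend: 'vllm',
};

// Mirrors the check in VLLM.perform(): only models declaring the vllm backend are accepted.
function acceptsModel(model: CatalogModel): boolean {
  return model.backend === 'vllm';
}

console.log(acceptsModel(entry)); // true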
3 changes: 3 additions & 0 deletions packages/backend/src/assets/inference-images.json
@@ -8,5 +8,8 @@
},
"openvino": {
"default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
},
"vllm": {
"default": "quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780"
}
}
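Note: the new vllm entry supplies the default container image for the provider. A minimal sketch of how it is consumed, mirroring the config.image ?? images.vllm.default lookup in VLLM.ts later in this diff (import path assumed relative to the workers/provider directory):

// Minimal sketch: resolve the image to pull, preferring an explicit override.
import * as images from '../../assets/inference-images.json';

function resolveVllmImage(override?: string): string {
  return override ?? images.vllm.default;
}

console.log(resolveVllmImage());
// -> quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780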
3 changes: 3 additions & 0 deletions packages/backend/src/managers/modelsManager.ts
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
model: ModelInfo,
labels?: { [key: string]: string },
): Promise<string> {
console.log('[ModelsManager] upload model', model);

// ensure the model upload is not disabled
if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@

// perform download
const path = uploader.perform(model.id);
console.log('[ModelsManager] path got', path);
await this.updateModelInfos();

return path;
5 changes: 4 additions & 1 deletion packages/backend/src/managers/playgroundV2Manager.ts
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
import type { ToolSet } from 'ai';
import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
import { toMcpClients } from '../utils/mcpUtils';
import { InferenceType } from '@shared/models/IInference';

export class PlaygroundV2Manager implements Disposable {
readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@ export class PlaygroundV2Manager implements Disposable {

// create/start inference server if necessary
const servers = this.inferenceManager.getServers();
console.log('servers', servers);
const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
if (!server) {
console.warn(`no running server found with modelId ${model.id}, creating a new one`);
await this.inferenceManager.createInferenceServer(
await withDefaultConfiguration({
modelsInfo: [model],
@@ -253,7 +256,7 @@

const start = Date.now();
streamProcessor
.stream(model, tools, options)
.stream(model, tools, server.type === InferenceType.VLLM ? {} : options)
.consumeStream()
.then(() => {
this.telemetry.logUsage('playground.message.complete', {
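Note: the .stream(...) change above sends an empty options object to vLLM-backed servers, while other backends keep the caller-provided options. A standalone sketch of that guard, with assumed simplified types (the real code uses the shared InferenceType enum and the playground's options type):

// Standalone sketch of the per-backend option guard.
enum InferenceType {
  LLAMA_CPP = 'llama-cpp',
  VLLM = 'vllm',
}

type ModelOptions = Record<string, unknown>;

function effectiveOptions(serverType: InferenceType, options: ModelOptions): ModelOptions {
  // vLLM servers get no extra options; everything else keeps the user-selected ones.
  return serverType === InferenceType.VLLM ? {} : options;
}

console.log(effectiveOptions(InferenceType.VLLM, { temperature: 0.2 })); // {}
console.log(effectiveOptions(InferenceType.LLAMA_CPP, { temperature: 0.2 })); // { temperature: 0.2 }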
4 changes: 4 additions & 0 deletions packages/backend/src/studio.ts
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackA
import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
import { OpenVINO } from './workers/provider/OpenVINO';
import os from 'node:os';
import { VLLM } from './workers/provider/VLLM';

export class Studio {
readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@ export class Studio {
),
);
}
this.#extensionContext.subscriptions.push(
this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
);

/**
* The inference manager create, stop, manage Inference servers
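Note: the VLLM provider is registered with the same subscription pattern as the other providers above, so it is unregistered when the extension is disposed. A minimal standalone sketch of that register/dispose pattern, with assumed simplified types (the real InferenceProviderRegistry lives in the extension):

// Minimal sketch of a registry whose register() returns a Disposable.
interface Disposable {
  dispose(): void;
}

class ProviderRegistry<T> {
  private providers: T[] = [];

  register(provider: T): Disposable {
    this.providers.push(provider);
    return {
      dispose: (): void => {
        this.providers = this.providers.filter(p => p !== provider);
      },
    };
  }
}

const subscriptions: Disposable[] = [];
const registry = new ProviderRegistry<string>();
subscriptions.push(registry.register('vllm'));
// On extension deactivation every subscription is disposed, which unregisters the provider.
subscriptions.forEach(s => s.dispose());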
156 changes: 156 additions & 0 deletions packages/backend/src/workers/provider/VLLM.ts
@@ -0,0 +1,156 @@
/**********************************************************************
* Copyright (C) 2024 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
***********************************************************************/

import { InferenceProvider } from './InferenceProvider';
import type { TaskRegistry } from '../../registries/TaskRegistry';
import type { PodmanConnection } from '../../managers/podmanConnection';
import { type InferenceServer, InferenceType } from '@shared/models/IInference';
import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
import * as images from '../../assets/inference-images.json';
import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
import { getHuggingFaceModelMountInfo } from '../../utils/modelsUtils';
import { SECOND } from './LlamaCppPython';

export class VLLM extends InferenceProvider {
constructor(
taskRegistry: TaskRegistry,
private podmanConnection: PodmanConnection,
) {
super(taskRegistry, InferenceType.VLLM, 'vllm');
}

dispose(): void {}

public enabled = (): boolean => true;

/**
* Here is an example
*
* podman run -it --rm
* -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
* -e HF_HUB_CACHE=/cache
* localhost/vllm-cpu-env:latest
* --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
*
* @param config
*/
override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
if (config.modelsInfo.length !== 1)
throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);

const modelInfo = config.modelsInfo[0];
if (modelInfo.backend !== InferenceType.VLLM) {
throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM}, got ${modelInfo.backend}.`);
}

if (modelInfo.file === undefined) {
throw new Error('The model info file provided is undefined');
}

console.log('[VLLM]', config);
console.log('[VLLM] modelInfo.file', modelInfo.file.path);

// modelInfo.file.path looks like ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots

// get model mount settings
const mountInfo = getHuggingFaceModelMountInfo(modelInfo);
const modelCache = mountInfo.suffix ? `/cache/${mountInfo.suffix}` : '/cache';

let connection: ContainerProviderConnection | undefined;
if (config.connection) {
connection = this.podmanConnection.getContainerProviderConnection(config.connection);
} else {
connection = this.podmanConnection.findRunningContainerProviderConnection();
}

if (!connection) throw new Error('no running connection could be found');

const labels: Record<string, string> = {
...config.labels,
[LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
};

const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
// https://huggingface.co/docs/transformers/main/en/installation#offline-mode
// HF_HUB_OFFLINE in main
// TRANSFORMERS_OFFLINE for legacy
const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];

labels['api'] = `http://localhost:${config.port}/v1`;

const mounts: MountConfig = [
{
Target: `/cache`,
Source: mountInfo.mount,
Type: 'bind',
},
];

const containerInfo = await this.createContainer(
imageInfo.engineId,
{
Image: imageInfo.Id,
Detach: true,
Labels: labels,
HostConfig: {
AutoRemove: false,
Mounts: mounts,
PortBindings: {
'8000/tcp': [
{
HostPort: `${config.port}`,
},
],
},
SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
},
HealthCheck: {
// must be the port INSIDE the container, not the exposed one
Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
Interval: SECOND * 5,
Retries: 4 * 5,
},
Env: envs,
Cmd: [
`--model=${modelCache}`,
`--served_model_name=${modelInfo.name}`,
'--chat-template-content-format=openai',
'--dtype=float32',
],
},
labels,
);

return {
models: [modelInfo],
status: 'running',
connection: {
port: config.port,
},
container: {
containerId: containerInfo.id,
engineId: containerInfo.engineId,
},
type: InferenceType.VLLM,
labels: labels,
};
}
}
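Note: once perform() reports the server as running, the container exposes vLLM's OpenAI-compatible API on the mapped host port, matching the api label and health check set above. A minimal usage sketch; the port value is an assumed example, and the model name is whatever was passed as --served_model_name:

// Minimal usage sketch against a running server (assumed example port).
const port = 35000; // whatever config.port was bound to 8000/tcp inside the container

// Same endpoint the container health check probes.
const version = await fetch(`http://localhost:${port}/version`);
console.log(await version.json());

// OpenAI-compatible chat completion, as used by the playground and the code snippet generator.
const completion = await fetch(`http://localhost:${port}/v1/chat/completions`, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'Qwen/Qwen2-VL-2B-Instruct', // the --served_model_name value
    messages: [{ role: 'user', content: 'Describe vLLM in one sentence.' }],
  }),
});
console.log(await completion.json());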
@@ -56,6 +56,7 @@ const generate = async (language: string, variant: string): Promise<void> => {
let options: RequestOptions | undefined;
switch (service?.type) {
case InferenceType.LLAMA_CPP:
case InferenceType.VLLM:
options = {
url: `http://localhost:${service?.connection.port || '??'}/v1/chat/completions`,
method: 'POST',
2 changes: 2 additions & 0 deletions packages/shared/src/models/IInference.ts
@@ -21,13 +21,15 @@ export enum InferenceType {
LLAMA_CPP = 'llama-cpp',
WHISPER_CPP = 'whisper-cpp',
OPENVINO = 'openvino',
VLLM = 'vllm',
NONE = 'none',
}

const InferenceTypeLabel = {
'llama-cpp': 'llamacpp',
'whisper-cpp': 'whispercpp',
openvino: 'openvino',
vllm: 'vLLM',
none: 'None',
};
