diff --git a/packages/backend/src/assets/ai.json b/packages/backend/src/assets/ai.json
index 4a6442fb6..eb510d977 100644
--- a/packages/backend/src/assets/ai.json
+++ b/packages/backend/src/assets/ai.json
@@ -526,6 +526,13 @@
       "license": "Apache-2.0",
       "url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
       "backend": "openvino"
+    },
+    {
+      "id": "Qwen/Qwen2-VL-2B-Instruct",
+      "name": "Qwen/Qwen2-VL-2B-Instruct",
+      "description": "Qwen/Qwen2-VL-2B-Instruct",
+      "url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
+      "backend": "vllm"
     }
   ],
   "categories": [
diff --git a/packages/backend/src/assets/inference-images.json b/packages/backend/src/assets/inference-images.json
index 880cbc40e..886076b98 100644
--- a/packages/backend/src/assets/inference-images.json
+++ b/packages/backend/src/assets/inference-images.json
@@ -8,5 +8,8 @@
   },
   "openvino": {
     "default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
+  },
+  "vllm": {
+    "default": "quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780"
   }
 }
diff --git a/packages/backend/src/managers/modelsManager.ts b/packages/backend/src/managers/modelsManager.ts
index 09a26c0cf..74465757b 100644
--- a/packages/backend/src/managers/modelsManager.ts
+++ b/packages/backend/src/managers/modelsManager.ts
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
     model: ModelInfo,
     labels?: { [key: string]: string },
   ): Promise<string> {
+    console.log('[ModelsManager] upload model', model);
+
     // ensure the model upload is not disabled
     if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
       console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@ export class ModelsManager implements Disposable {
 
     // perform download
     const path = uploader.perform(model.id);
+    console.log('[ModelsManager] path got', path);
 
     await this.updateModelInfos();
     return path;
diff --git a/packages/backend/src/managers/playgroundV2Manager.ts b/packages/backend/src/managers/playgroundV2Manager.ts
index b7b4a53ad..fef481269 100644
--- a/packages/backend/src/managers/playgroundV2Manager.ts
+++ b/packages/backend/src/managers/playgroundV2Manager.ts
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
 import type { ToolSet } from 'ai';
 import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
 import { toMcpClients } from '../utils/mcpUtils';
+import { InferenceType } from '@shared/models/IInference';
 
 export class PlaygroundV2Manager implements Disposable {
   readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@ export class PlaygroundV2Manager implements Disposable {
 
     // create/start inference server if necessary
     const servers = this.inferenceManager.getServers();
+    console.log('servers', servers);
     const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
     if (!server) {
+      console.warn(`no server running found with modelId ${model.id}, creating new one`);
       await this.inferenceManager.createInferenceServer(
         await withDefaultConfiguration({
           modelsInfo: [model],
@@ -253,7 +256,7 @@ export class PlaygroundV2Manager implements Disposable {
 
     const start = Date.now();
     streamProcessor
-      .stream(model, tools, options)
+      .stream(model, tools, server.type === InferenceType.VLLM ? {} : options)
       .consumeStream()
       .then(() => {
         this.telemetry.logUsage('playground.message.complete', {
diff --git a/packages/backend/src/studio.ts b/packages/backend/src/studio.ts
index 811c1eeaa..dd6b4ef62 100644
--- a/packages/backend/src/studio.ts
+++ b/packages/backend/src/studio.ts
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackA
 import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
 import { OpenVINO } from './workers/provider/OpenVINO';
 import os from 'node:os';
+import { VLLM } from './workers/provider/VLLM';
 
 export class Studio {
   readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@ export class Studio {
         ),
       );
     }
+    this.#extensionContext.subscriptions.push(
+      this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
+    );
 
     /**
      * The inference manager create, stop, manage Inference servers
diff --git a/packages/backend/src/workers/provider/VLLM.ts b/packages/backend/src/workers/provider/VLLM.ts
new file mode 100644
index 000000000..e413aa90e
--- /dev/null
+++ b/packages/backend/src/workers/provider/VLLM.ts
@@ -0,0 +1,156 @@
+/**********************************************************************
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ***********************************************************************/
+
+import { InferenceProvider } from './InferenceProvider';
+import type { TaskRegistry } from '../../registries/TaskRegistry';
+import type { PodmanConnection } from '../../managers/podmanConnection';
+import { type InferenceServer, InferenceType } from '@shared/models/IInference';
+import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
+import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
+import * as images from '../../assets/inference-images.json';
+import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
+import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
+import { getHuggingFaceModelMountInfo } from '../../utils/modelsUtils';
+import { SECOND } from './LlamaCppPython';
+
+export class VLLM extends InferenceProvider {
+  constructor(
+    taskRegistry: TaskRegistry,
+    private podmanConnection: PodmanConnection,
+  ) {
+    super(taskRegistry, InferenceType.VLLM, 'vllm');
+  }
+
+  dispose(): void {}
+
+  public enabled = (): boolean => true;
+
+  /**
+   * Here is an example
+   *
+   * podman run -it --rm
+   * -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
+   * -e HF_HUB_CACHE=/cache
+   * localhost/vllm-cpu-env:latest
+   * --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
+   *
+   * @param config
+   */
+  override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
+    if (config.modelsInfo.length !== 1)
+      throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);
+
+    const modelInfo = config.modelsInfo[0];
+    if (modelInfo.backend !== InferenceType.VLLM) {
+      throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM} got ${modelInfo.backend}.`);
+    }
+
+    if (modelInfo.file === undefined) {
+      throw new Error('The model info file provided is undefined');
+    }
+
+    console.log('[VLLM]', config);
+    console.log('[VLLM] modelInfo.file', modelInfo.file.path);
+
+    // something ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots
+    // modelInfo.file.path
+
+    // get model mount settings
+    const mountInfo = getHuggingFaceModelMountInfo(modelInfo);
+    const modelCache = mountInfo.suffix ? `/cache/${mountInfo.suffix}` : '/cache';
+
+    let connection: ContainerProviderConnection | undefined;
+    if (config.connection) {
+      connection = this.podmanConnection.getContainerProviderConnection(config.connection);
+    } else {
+      connection = this.podmanConnection.findRunningContainerProviderConnection();
+    }
+
+    if (!connection) throw new Error('no running connection could be found');
+
+    const labels: Record<string, string> = {
+      ...config.labels,
+      [LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
+    };
+
+    const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
+    // https://huggingface.co/docs/transformers/main/en/installation#offline-mode
+    // HF_HUB_OFFLINE in main
+    // TRANSFORMERS_OFFLINE for legacy
+    const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];
+
+    labels['api'] = `http://localhost:${config.port}/v1`;
+
+    const mounts: MountConfig = [
+      {
+        Target: `/cache`,
+        Source: mountInfo.mount,
+        Type: 'bind',
+      },
+    ];
+
+    const containerInfo = await this.createContainer(
+      imageInfo.engineId,
+      {
+        Image: imageInfo.Id,
+        Detach: true,
+        Labels: labels,
+        HostConfig: {
+          AutoRemove: false,
+          Mounts: mounts,
+          PortBindings: {
+            '8000/tcp': [
+              {
+                HostPort: `${config.port}`,
+              },
+            ],
+          },
+          SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
+        },
+        HealthCheck: {
+          // must be the port INSIDE the container not the exposed one
+          Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
+          Interval: SECOND * 5,
+          Retries: 4 * 5,
+        },
+        Env: envs,
+        Cmd: [
+          `--model=${modelCache}`,
+          `--served_model_name=${modelInfo.name}`,
+          '--chat-template-content-format=openai',
+          '--dtype=float32',
+        ],
+      },
+      labels,
+    );
+
+    return {
+      models: [modelInfo],
+      status: 'running',
+      connection: {
+        port: config.port,
+      },
+      container: {
+        containerId: containerInfo.id,
+        engineId: containerInfo.engineId,
+      },
+      type: InferenceType.VLLM,
+      labels: labels,
+    };
+  }
+}
diff --git a/packages/frontend/src/pages/InferenceServerDetails.svelte b/packages/frontend/src/pages/InferenceServerDetails.svelte
index 0ea8b1d88..8e89db501 100644
--- a/packages/frontend/src/pages/InferenceServerDetails.svelte
+++ b/packages/frontend/src/pages/InferenceServerDetails.svelte
@@ -56,6 +56,7 @@ const generate = async (language: string, variant: string): Promise<void> => {
   let options: RequestOptions | undefined;
   switch (service?.type) {
     case InferenceType.LLAMA_CPP:
+    case InferenceType.VLLM:
       options = {
         url: `http://localhost:${service?.connection.port || '??'}/v1/chat/completions`,
         method: 'POST',
diff --git a/packages/shared/src/models/IInference.ts b/packages/shared/src/models/IInference.ts
index 50a0bf5e0..1f9e4e09b 100644
--- a/packages/shared/src/models/IInference.ts
+++ b/packages/shared/src/models/IInference.ts
@@ -21,6 +21,7 @@ export enum InferenceType {
   LLAMA_CPP = 'llama-cpp',
   WHISPER_CPP = 'whisper-cpp',
   OPENVINO = 'openvino',
+  VLLM = 'vllm',
   NONE = 'none',
 }
 
@@ -28,6 +29,7 @@ const InferenceTypeLabel = {
   'llama-cpp': 'llamacpp',
   'whisper-cpp': 'whispercpp',
   openvino: 'openvino',
+  vllm: 'vLLM',
   none: 'None',
 };
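
A quick way to smoke-test a server created by this provider is to call the OpenAI-compatible API it exposes, i.e. the same http://localhost:<port>/v1 URL that the perform() method stores in the api label and that InferenceServerDetails.svelte now uses for vLLM snippets. The sketch below is not part of the change; it assumes Node 18+ (global fetch), a server already running, and illustrative values for the port and served model name.

// Smoke test for a vLLM inference server started by the VLLM provider (Node 18+, global fetch).
// PORT and MODEL are illustrative placeholders; they must match config.port and the
// --served_model_name value the provider passed to the container.
const PORT = 8000;
const MODEL = 'Qwen/Qwen2-VL-2B-Instruct';

async function main(): Promise<void> {
  // Same endpoint the container health check polls.
  const version = await fetch(`http://localhost:${PORT}/version`);
  console.log('vLLM version:', await version.json());

  // OpenAI-compatible chat completion, matching the URL stored in the api label.
  const response = await fetch(`http://localhost:${PORT}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: MODEL,
      messages: [{ role: 'user', content: 'Say hello in one sentence.' }],
    }),
  });
  const completion = await response.json();
  console.log(completion.choices?.[0]?.message?.content);
}

main().catch(console.error);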