7 changes: 7 additions & 0 deletions packages/backend/src/assets/ai.json
@@ -526,6 +526,13 @@
"license": "Apache-2.0",
"url": "huggingface:/OpenVINO/mistral-7B-instruct-v0.2-int4-ov",
"backend": "openvino"
},
{
"id": "Qwen/Qwen2-VL-2B-Instruct",
"name": "Qwen/Qwen2-VL-2B-Instruct",
"description": "Qwen/Qwen2-VL-2B-Instruct",
"url": "huggingface:/Qwen/Qwen2-VL-2B-Instruct",
"backend": "vllm"
}
],
"categories": [
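Note: the new catalog entry above ties Qwen/Qwen2-VL-2B-Instruct to the vLLM backend through its backend field, the same field VLLM.perform() validates further down in this diff. A minimal standalone sketch of that routing, with an assumed simplified catalog shape:

// Minimal standalone sketch (assumed simplified shape of a catalog entry).
interface CatalogModel {
  id: string;
  name: string;
  description: string;
  url: string;
  backend?: string;
}

const entry: CatalogModel = {
  id: 'Qwen/Qwen2-VL-2B-Instruct',
  name: 'Qwen/Qwen2-VL-2B-Instruct',
  description: 'Qwen/Qwen2-VL-2B-Instruct',
  url: 'huggingface:/Qwen/Qwen2-VL-2B-Instruct',
  backend: 'vllm',
};

// Mirrors the check in VLLM.perform(): only models declaring the vllm backend are accepted.
function acceptsModel(model: CatalogModel): boolean {
  return model.backend === 'vllm';
}

console.log(acceptsModel(entry)); // true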
3 changes: 3 additions & 0 deletions packages/backend/src/assets/inference-images.json
@@ -8,5 +8,8 @@
},
"openvino": {
"default": "quay.io/ramalama/openvino@sha256:670d91cc322933cc4263606459317cd4ca3fcfb16d59a46b11dcd498c2cd7cb5"
},
"vllm": {
"default": "quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780"
}
}
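Note: the new vllm entry supplies the default container image for the provider. A minimal sketch of how it is consumed, mirroring the config.image ?? images.vllm.default lookup in VLLM.ts later in this diff (import path assumed relative to the workers/provider directory):

// Minimal sketch: resolve the image to pull, preferring an explicit override.
import * as images from '../../assets/inference-images.json';

function resolveVllmImage(override?: string): string {
  return override ?? images.vllm.default;
}

console.log(resolveVllmImage());
// -> quay.io/podman-ai-lab/vllm@sha256:8a2d2894835bcb560b7c76b0c0f0e8d19ef21de5ad0c9508809ba73cfe349780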
3 changes: 3 additions & 0 deletions packages/backend/src/managers/modelsManager.ts
@@ -375,6 +375,8 @@ export class ModelsManager implements Disposable {
model: ModelInfo,
labels?: { [key: string]: string },
): Promise<string> {
console.log('[ModelsManager] upload model', model);

// ensure the model upload is not disabled
if (this.configurationRegistry.getExtensionConfiguration().modelUploadDisabled) {
console.warn('The model upload is disabled, this may cause the inference server to take a few minutes to start.');
@@ -392,6 +394,7 @@

// perform download
const path = uploader.perform(model.id);
console.log('[ModelsManager] path got', path);
await this.updateModelInfos();

return path;
5 changes: 4 additions & 1 deletion packages/backend/src/managers/playgroundV2Manager.ts
@@ -34,6 +34,7 @@ import { McpServerManager } from './playground/McpServerManager';
import type { ToolSet } from 'ai';
import { simulateStreamingMiddleware, wrapLanguageModel } from 'ai';
import { toMcpClients } from '../utils/mcpUtils';
import { InferenceType } from '@shared/models/IInference';

export class PlaygroundV2Manager implements Disposable {
readonly #conversationRegistry: ConversationRegistry;
@@ -122,8 +123,10 @@ export class PlaygroundV2Manager implements Disposable {

// create/start inference server if necessary
const servers = this.inferenceManager.getServers();
console.log('servers', servers);
const server = servers.find(s => s.models.map(mi => mi.id).includes(model.id));
if (!server) {
console.warn(`no running server found with modelId ${model.id}, creating a new one`);
await this.inferenceManager.createInferenceServer(
await withDefaultConfiguration({
modelsInfo: [model],
@@ -253,7 +256,7 @@

const start = Date.now();
streamProcessor
.stream(model, tools, options)
.stream(model, tools, server.type === InferenceType.VLLM ? {} : options)
.consumeStream()
.then(() => {
this.telemetry.logUsage('playground.message.complete', {
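Note: the .stream(...) change above sends an empty options object to vLLM-backed servers, while other backends keep the caller-provided options. A standalone sketch of that guard, with assumed simplified types (the real code uses the shared InferenceType enum and the playground's options type):

// Standalone sketch of the per-backend option guard.
enum InferenceType {
  LLAMA_CPP = 'llama-cpp',
  VLLM = 'vllm',
}

type ModelOptions = Record<string, unknown>;

function effectiveOptions(serverType: InferenceType, options: ModelOptions): ModelOptions {
  // vLLM servers get no extra options; everything else keeps the user-selected ones.
  return serverType === InferenceType.VLLM ? {} : options;
}

console.log(effectiveOptions(InferenceType.VLLM, { temperature: 0.2 })); // {}
console.log(effectiveOptions(InferenceType.LLAMA_CPP, { temperature: 0.2 })); // { temperature: 0.2 }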
4 changes: 4 additions & 0 deletions packages/backend/src/studio.ts
@@ -64,6 +64,7 @@ import { LLAMA_STACK_API_CHANNEL, type LlamaStackAPI } from '@shared/LlamaStackA
import { LlamaStackManager } from './managers/llama-stack/llamaStackManager';
import { OpenVINO } from './workers/provider/OpenVINO';
import os from 'node:os';
import { VLLM } from './workers/provider/VLLM';

export class Studio {
readonly #extensionContext: ExtensionContext;
@@ -289,6 +290,9 @@ export class Studio {
),
);
}
this.#extensionContext.subscriptions.push(
this.#inferenceProviderRegistry.register(new VLLM(this.#taskRegistry, this.#podmanConnection)),
);

/**
* The inference manager create, stop, manage Inference servers
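Note: the VLLM provider is registered with the same subscription pattern as the other providers above, so it is unregistered when the extension is disposed. A minimal standalone sketch of that register/dispose pattern, with assumed simplified types (the real InferenceProviderRegistry lives in the extension):

// Minimal sketch of a registry whose register() returns a Disposable.
interface Disposable {
  dispose(): void;
}

class ProviderRegistry<T> {
  private providers: T[] = [];

  register(provider: T): Disposable {
    this.providers.push(provider);
    return {
      dispose: (): void => {
        this.providers = this.providers.filter(p => p !== provider);
      },
    };
  }
}

const subscriptions: Disposable[] = [];
const registry = new ProviderRegistry<string>();
subscriptions.push(registry.register('vllm'));
// On extension deactivation every subscription is disposed, which unregisters the provider.
subscriptions.forEach(s => s.dispose());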
156 changes: 156 additions & 0 deletions packages/backend/src/workers/provider/VLLM.ts
@@ -0,0 +1,156 @@
/**********************************************************************
* Copyright (C) 2024 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
***********************************************************************/

import { InferenceProvider } from './InferenceProvider';
import type { TaskRegistry } from '../../registries/TaskRegistry';
import type { PodmanConnection } from '../../managers/podmanConnection';
import { type InferenceServer, InferenceType } from '@shared/models/IInference';
import type { InferenceServerConfig } from '@shared/models/InferenceServerConfig';
import type { ContainerProviderConnection, MountConfig } from '@podman-desktop/api';
import * as images from '../../assets/inference-images.json';
import { LABEL_INFERENCE_SERVER } from '../../utils/inferenceUtils';
import { DISABLE_SELINUX_LABEL_SECURITY_OPTION } from '../../utils/utils';
import { getHuggingFaceModelMountInfo } from '../../utils/modelsUtils';
import { SECOND } from './LlamaCppPython';

export class VLLM extends InferenceProvider {
constructor(
taskRegistry: TaskRegistry,
private podmanConnection: PodmanConnection,
) {
super(taskRegistry, InferenceType.VLLM, 'vllm');
}

dispose(): void {}

public enabled = (): boolean => true;

/**
* Here is an example
*
* podman run -it --rm
* -v C:\Users\axels\.cache\huggingface\hub\models--mistralai--Mistral-7B-v0.1:/cache/models--mistralai--Mistral-7B-v0.1
* -e HF_HUB_CACHE=/cache
* localhost/vllm-cpu-env:latest
* --model=/cache/models--mistralai--Mistral-7B-v0.1/snapshots/7231864981174d9bee8c7687c24c8344414eae6b
*
* @param config
*/
override async perform(config: InferenceServerConfig): Promise<InferenceServer> {
if (config.modelsInfo.length !== 1)
throw new Error(`only one model is supported, received ${config.modelsInfo.length}`);

const modelInfo = config.modelsInfo[0];
if (modelInfo.backend !== InferenceType.VLLM) {
throw new Error(`VLLM requires models with backend type ${InferenceType.VLLM}, got ${modelInfo.backend}.`);
}

if (modelInfo.file === undefined) {
throw new Error('The model info file provided is undefined');
}

console.log('[VLLM]', config);
console.log('[VLLM] modelInfo.file', modelInfo.file.path);

// modelInfo.file.path looks like ~/.cache/huggingface/hub/models--facebook--opt-125m/snapshots

// get model mount settings
const mountInfo = getHuggingFaceModelMountInfo(modelInfo);
const modelCache = mountInfo.suffix ? `/cache/${mountInfo.suffix}` : '/cache';

let connection: ContainerProviderConnection | undefined;
if (config.connection) {
connection = this.podmanConnection.getContainerProviderConnection(config.connection);
} else {
connection = this.podmanConnection.findRunningContainerProviderConnection();
}

if (!connection) throw new Error('no running connection could be found');

const labels: Record<string, string> = {
...config.labels,
[LABEL_INFERENCE_SERVER]: JSON.stringify(config.modelsInfo.map(model => model.id)),
};

const imageInfo = await this.pullImage(connection, config.image ?? images.vllm.default, labels);
// https://huggingface.co/docs/transformers/main/en/installation#offline-mode
// HF_HUB_OFFLINE in main
// TRANSFORMERS_OFFLINE for legacy
const envs: string[] = [`HF_HUB_CACHE=/cache`, 'TRANSFORMERS_OFFLINE=1', 'HF_HUB_OFFLINE=1'];

labels['api'] = `http://localhost:${config.port}/v1`;

const mounts: MountConfig = [
{
Target: `/cache`,
Source: mountInfo.mount,
Type: 'bind',
},
];

const containerInfo = await this.createContainer(
imageInfo.engineId,
{
Image: imageInfo.Id,
Detach: true,
Labels: labels,
HostConfig: {
AutoRemove: false,
Mounts: mounts,
PortBindings: {
'8000/tcp': [
{
HostPort: `${config.port}`,
},
],
},
SecurityOpt: [DISABLE_SELINUX_LABEL_SECURITY_OPTION],
},
HealthCheck: {
// must be the port INSIDE the container, not the exposed one
Test: ['CMD-SHELL', `curl -sSf localhost:8000/version > /dev/null`],
Interval: SECOND * 5,
Retries: 4 * 5,
},
Env: envs,
Cmd: [
`--model=${modelCache}`,
`--served_model_name=${modelInfo.name}`,
'--chat-template-content-format=openai',
'--dtype=float32',
],
},
labels,
);

return {
models: [modelInfo],
status: 'running',
connection: {
port: config.port,
},
container: {
containerId: containerInfo.id,
engineId: containerInfo.engineId,
},
type: InferenceType.VLLM,
labels: labels,
};
}
}
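Note: once perform() reports the server as running, the container exposes vLLM's OpenAI-compatible API on the mapped host port, matching the api label and health check set above. A minimal usage sketch; the port value is an assumed example, and the model name is whatever was passed as --served_model_name:

// Minimal usage sketch against a running server (assumed example port).
const port = 35000; // whatever config.port was bound to 8000/tcp inside the container

// Same endpoint the container health check probes.
const version = await fetch(`http://localhost:${port}/version`);
console.log(await version.json());

// OpenAI-compatible chat completion, as used by the playground and the code snippet generator.
const completion = await fetch(`http://localhost:${port}/v1/chat/completions`, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'Qwen/Qwen2-VL-2B-Instruct', // the --served_model_name value
    messages: [{ role: 'user', content: 'Describe vLLM in one sentence.' }],
  }),
});
console.log(await completion.json());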
@@ -56,6 +56,7 @@ const generate = async (language: string, variant: string): Promise<void> => {
let options: RequestOptions | undefined;
switch (service?.type) {
case InferenceType.LLAMA_CPP:
case InferenceType.VLLM:
options = {
url: `http://localhost:${service?.connection.port || '??'}/v1/chat/completions`,
method: 'POST',
2 changes: 2 additions & 0 deletions packages/shared/src/models/IInference.ts
@@ -21,13 +21,15 @@ export enum InferenceType {
LLAMA_CPP = 'llama-cpp',
WHISPER_CPP = 'whisper-cpp',
OPENVINO = 'openvino',
VLLM = 'vllm',
NONE = 'none',
}

const InferenceTypeLabel = {
'llama-cpp': 'llamacpp',
'whisper-cpp': 'whispercpp',
openvino: 'openvino',
vllm: 'vLLM',
none: 'None',
};
