diff --git a/package.json b/package.json index ae247ee..1767626 100644 --- a/package.json +++ b/package.json @@ -83,37 +83,6 @@ }, "inference.model": { "type": "string", - "enum": [ - "stable-code:3b-code-q4_0", - "codellama:7b-code-q4_K_S", - "codellama:7b-code-q4_K_M", - "codellama:7b-code-q6_K", - "codellama:7b-code-fp16", - "codellama:13b-code-q4_K_S", - "codellama:13b-code-q4_K_M", - "codellama:13b-code-q6_K", - "codellama:13b-code-fp16", - "codellama:34b-code-q4_K_S", - "codellama:34b-code-q4_K_M", - "codellama:34b-code-q6_K", - "codellama:70b-code-q4_K_S", - "codellama:70b-code-q4_K_M", - "codellama:70b-code-q6_K", - "codellama:70b-code-fp16", - "deepseek-coder:1.3b-base-q4_0", - "deepseek-coder:1.3b-base-q4_1", - "deepseek-coder:1.3b-base-q8_0", - "deepseek-coder:6.7b-base-q4_K_S", - "deepseek-coder:6.7b-base-q4_K_M", - "deepseek-coder:6.7b-base-q5_K_S", - "deepseek-coder:6.7b-base-q5_K_M", - "deepseek-coder:6.7b-base-q8_0", - "deepseek-coder:6.7b-base-fp16", - "deepseek-coder:33b-base-q4_K_S", - "deepseek-coder:33b-base-q4_K_M", - "deepseek-coder:33b-base-fp16", - "custom" - ], "default": "stable-code:3b-code-q4_0", "description": "Inference model to use", "order": 2 @@ -124,23 +93,6 @@ "description": "Temperature of the model. Increasing the temperature will make the model answer more creatively.", "order": 3 }, - "inference.custom.model": { - "type": "string", - "default": "", - "description": "Custom model name", - "order": 4 - }, - "inference.custom.format": { - "type": "string", - "enum": [ - "stable-code", - "codellama", - "deepseek" - ], - "default": "stable-code", - "description": "Custom model prompt format", - "order": 5 - }, "inference.maxLines": { "type": "number", "default": 16, diff --git a/src/config.ts b/src/config.ts index a6294b1..0cdafd8 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,5 +1,4 @@ import vscode from 'vscode'; -import { ModelFormat } from './prompts/processors/models'; class Config { @@ -24,17 +23,6 @@ class Config { // Load model let modelName = config.get('model') as string; - let modelFormat: ModelFormat = 'codellama'; - if (modelName === 'custom') { - modelName = config.get('custom.model') as string; - modelFormat = config.get('cutom.format') as ModelFormat; - } else { - if (modelName.startsWith('deepseek-coder')) { - modelFormat = 'deepseek'; - } else if (modelName.startsWith('stable-code')) { - modelFormat = 'stable-code'; - } - } let delay = config.get('delay') as number; @@ -45,7 +33,6 @@ class Config { maxTokens, temperature, modelName, - modelFormat, delay }; } diff --git a/src/modules/lineGenerator.ts b/src/modules/lineGenerator.ts deleted file mode 100644 index f20081f..0000000 --- a/src/modules/lineGenerator.ts +++ /dev/null @@ -1,54 +0,0 @@ -export async function* lineGenerator(url: string, data: any, bearerToken: string): AsyncGenerator { - // Request - const controller = new AbortController(); - let res = await fetch(url, { - method: 'POST', - body: JSON.stringify(data), - headers: bearerToken ? 
{ - 'Content-Type': 'application/json', - Authorization: `Bearer ${bearerToken}`, - } : { - 'Content-Type': 'application/json', - }, - signal: controller.signal, - }); - if (!res.ok || !res.body) { - throw Error('Unable to connect to backend'); - } - - // Reading stream - let stream = res.body.getReader(); - const decoder = new TextDecoder(); - let pending: string = ''; - try { - while (true) { - const { done, value } = await stream.read(); - - // If ended - if (done) { - if (pending.length > 0) { // New lines are impossible here - yield pending; - } - break; - } - - // Append chunk - let chunk = decoder.decode(value); - console.warn(chunk); - pending += chunk; - - // Yield results - while (pending.indexOf('\n') >= 0) { - let offset = pending.indexOf('\n'); - yield pending.slice(0, offset); - pending = pending.slice(offset + 1); - } - } - } finally { - stream.releaseLock(); - if (!stream.closed) { // Stop generation - await stream.cancel(); - } - controller.abort(); - } -} \ No newline at end of file diff --git a/src/modules/ollamaCheckModel.ts b/src/modules/ollamaCheckModel.ts deleted file mode 100644 index 5aedb8f..0000000 --- a/src/modules/ollamaCheckModel.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { info } from "./log"; - -export async function ollamaCheckModel(endpoint: string, model: string, bearerToken: string) { - // Check if exists - let res = await fetch(endpoint + '/api/tags', { - headers: bearerToken ? { - Authorization: `Bearer ${bearerToken}`, - } : {}, - }); - if (!res.ok) { - info(await res.text()); - info(endpoint + '/api/tags'); - throw Error('Network response was not ok.'); - } - let body = await res.json() as { models: { name: string }[] }; - if (body.models.find((v) => v.name === model)) { - return true; - } else { - return false; - } -} \ No newline at end of file diff --git a/src/modules/ollamaDownloadModel.ts b/src/modules/ollamaDownloadModel.ts deleted file mode 100644 index 7e6eccb..0000000 --- a/src/modules/ollamaDownloadModel.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { lineGenerator } from "./lineGenerator"; -import { info } from "./log"; - -export async function ollamaDownloadModel(endpoint: string, model: string, bearerToken: string) { - info('Downloading model from ollama: ' + model); - for await (let line of lineGenerator(endpoint + '/api/pull', { name: model }, bearerToken)) { - info('[DOWNLOAD] ' + line); - } -} \ No newline at end of file diff --git a/src/modules/ollamaRequest.ts b/src/modules/ollamaRequest.ts new file mode 100644 index 0000000..cbf8d93 --- /dev/null +++ b/src/modules/ollamaRequest.ts @@ -0,0 +1,35 @@ +export async function makeOllamaRequest(url: string, data: any, bearerToken: string): Promise { + // Request + const controller = new AbortController(); + let res = await fetch(url, { + method: 'POST', + body: JSON.stringify(data), + headers: bearerToken ? 
{ + 'Content-Type': 'application/json', + Authorization: `Bearer ${bearerToken}`, + } : { + 'Content-Type': 'application/json', + }, + signal: controller.signal, + }); + if (!res.ok || !res.body) { + throw Error('Unable to connect to backend'); + } + + // Reading stream + let stream = res.body.getReader(); + const decoder = new TextDecoder(); + try { + const { value } = await stream.read(); + + // Append chunk + let chunk = decoder.decode(value); + return chunk; + } finally { + stream.releaseLock(); + if (!stream.closed) { // Stop generation + await stream.cancel(); + } + controller.abort(); + } +} \ No newline at end of file diff --git a/src/modules/ollamaTokenGenerator.ts b/src/modules/ollamaTokenGenerator.ts deleted file mode 100644 index f57fef9..0000000 --- a/src/modules/ollamaTokenGenerator.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { lineGenerator } from "./lineGenerator"; -import { info } from "./log"; - -export type OllamaToken = { - model: string, - response: string, - done: boolean -}; - -export async function* ollamaTokenGenerator(url: string, data: any, bearerToken: string): AsyncGenerator { - for await (let line of lineGenerator(url, data, bearerToken)) { - info('Receive line: ' + line); - let parsed: OllamaToken; - try { - parsed = JSON.parse(line) as OllamaToken; - } catch (e) { - console.warn('Receive wrong line: ' + line); - continue; - } - yield parsed; - } -} \ No newline at end of file diff --git a/src/prompts/autocomplete.ts b/src/prompts/autocomplete.ts index b6a8089..3415d17 100644 --- a/src/prompts/autocomplete.ts +++ b/src/prompts/autocomplete.ts @@ -1,13 +1,14 @@ -import { ollamaTokenGenerator } from '../modules/ollamaTokenGenerator'; -import { countSymbol } from '../modules/text'; -import { info } from '../modules/log'; -import { ModelFormat, adaptPrompt } from './processors/models'; +import { makeOllamaRequest } from "../modules/ollamaRequest"; + +type OllamaToken = { + model: string, + response: string, +}; export async function autocomplete(args: { endpoint: string, bearerToken: string, model: string, - format: ModelFormat, prefix: string, suffix: string, maxLines: number, @@ -16,88 +17,33 @@ export async function autocomplete(args: { canceled?: () => boolean, }): Promise { - let prompt = adaptPrompt({ prefix: args.prefix, suffix: args.suffix, format: args.format }); - // Calculate arguments let data = { model: args.model, - prompt: prompt.prompt, + prompt: args.prefix, + suffix: args.suffix, raw: true, + stream: false, options: { - stop: prompt.stop, num_predict: args.maxTokens, temperature: args.temperature } }; - // Receiving tokens - let res = ''; - let totalLines = 1; - let blockStack: ('[' | '(' | '{')[] = []; - outer: for await (let tokens of ollamaTokenGenerator(args.endpoint + '/api/generate', data, args.bearerToken)) { + const res = await makeOllamaRequest(args.endpoint + '/api/generate', data, args.bearerToken); + try { + const tokens = JSON.parse(res) as OllamaToken; if (args.canceled && args.canceled()) { - break; - } - - // Block stack - for (let c of tokens.response) { - - // Open block - if (c === '[') { - blockStack.push('['); - } else if (c === '(') { - blockStack.push('('); - } - if (c === '{') { - blockStack.push('{'); - } - - // Close block - if (c === ']') { - if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '[') { - blockStack.pop(); - } else { - info('Block stack error, breaking.'); - break outer; - } - } - if (c === ')') { - if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '(') { - blockStack.pop(); 
- } else { - info('Block stack error, breaking.'); - break outer; - } - } - if (c === '}') { - if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '{') { - blockStack.pop(); - } else { - info('Block stack error, breaking.'); - break outer; - } - } - - // Append charater - res += c; - } - - // Update total lines - totalLines += countSymbol(tokens.response, '\n'); - // Break if too many lines and on top level - if (totalLines > args.maxLines && blockStack.length === 0) { - info('Too many lines, breaking.'); - break; + return ""; } + const response = tokens.response; + + // take only args.maLines lines from the response + let lines = response.split('\n'); + lines = lines.slice(0, args.maxLines); + return lines.join('\n'); + } catch (e) { + console.warn('Receive wrong line: ' + res); + return ""; } - - // Remove - if (res.endsWith('')) { - res = res.slice(0, res.length - 5); - } - - // Trim ends of all lines since sometimes the AI completion will add extra spaces - res = res.split('\n').map((v) => v.trimEnd()).join('\n'); - - return res; } \ No newline at end of file diff --git a/src/prompts/processors/models.ts b/src/prompts/processors/models.ts deleted file mode 100644 index 058905f..0000000 --- a/src/prompts/processors/models.ts +++ /dev/null @@ -1,34 +0,0 @@ -export type ModelFormat = 'codellama' | 'deepseek' | 'stable-code'; - -export function adaptPrompt(args: { format: ModelFormat, prefix: string, suffix: string }): { prompt: string, stop: string[] } { - - // Common non FIM mode - // if (!args.suffix) { - // return { - // prompt: args.prefix, - // stop: [``] - // }; - // } - - // Starcoder FIM - if (args.format === 'deepseek') { - return { - prompt: `<|fim▁begin|>${args.prefix}<|fim▁hole|>${args.suffix}<|fim▁end|>`, - stop: [`<|fim▁begin|>`, `<|fim▁hole|>`, `<|fim▁end|>`, ``] - }; - } - - // Stable code FIM - if (args.format === 'stable-code') { - return { - prompt: `${args.prefix}${args.suffix}`, - stop: [`<|endoftext|>`] - }; - } - - // Codellama FIM - return { - prompt: `
<PRE> ${args.prefix} <SUF>${args.suffix} <MID>`,
-        stop: [`<END>`, `<EOD>`, `</s>`]
-    };
-}
\ No newline at end of file
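
Note: with adaptPrompt and the per-model FIM templates above removed, fill-in-the-middle formatting is delegated to Ollama itself: the new autocomplete() sends the raw prefix and suffix in the request body and lets the server apply the model's own template. A minimal sketch of the resulting non-streaming call, reusing makeOllamaRequest from the new src/modules/ollamaRequest.ts (the endpoint URL, import path, and option values below are illustrative, not part of the diff; the model name is the package.json default):

    import { makeOllamaRequest } from './modules/ollamaRequest';

    async function completeOnce(prefix: string, suffix: string): Promise<string> {
        // Same request shape as the data object built in the new autocomplete():
        // a single JSON response with prefix/suffix instead of a hand-built FIM prompt.
        const body = {
            model: 'stable-code:3b-code-q4_0', // default from package.json
            prompt: prefix,
            suffix: suffix,
            raw: true,
            stream: false,
            options: { num_predict: 256, temperature: 0.2 }, // illustrative values
        };
        const res = await makeOllamaRequest('http://127.0.0.1:11434/api/generate', body, '');
        return (JSON.parse(res) as { response: string }).response;
    }
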
diff --git a/src/prompts/promptCache.ts b/src/prompts/promptCache.ts
deleted file mode 100644
index d4b4d73..0000000
--- a/src/prompts/promptCache.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// Remove all newlines, double spaces, etc
-function normalizeText(src: string) {
-    src = src.split('\n').join(' ');
-    src = src.replace(/\s+/gm, ' ');
-    return src;
-}
-
-function extractPromptCacheKey(args: { prefix: string, suffix: string | null }) {
-    if (args.suffix) {
-        return normalizeText(args.prefix + ' ##CURSOR## ' + args.suffix);
-    } else {
-        return normalizeText(args.prefix);
-    }
-}
-
-// TODO: make it LRU
-let cache: { [key: string]: string | null } = {};
-
-export function getFromPromptCache(args: { prefix: string, suffix: string | null }): string | undefined | null {
-    const key = extractPromptCacheKey(args);
-    return cache[key];
-}
-
-export function setPromptToCache(args: { prefix: string, suffix: string | null, value: string | null }) {
-    const key = extractPromptCacheKey(args);
-    cache[key] = args.value;
-}
\ No newline at end of file
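
Note: the deleted cache keyed completions on a whitespace-normalized prefix, a ##CURSOR## marker, and the suffix, and carried a "TODO: make it LRU". If caching is ever reintroduced, a bounded LRU drop-in with the same keying and the same get/set signatures could look like this sketch (the 256-entry cap is an arbitrary choice, not from the diff):

    const MAX_ENTRIES = 256; // arbitrary cap, tune as needed

    // Same keying as the removed promptCache.ts: collapse whitespace and mark the cursor position.
    function cacheKey(args: { prefix: string, suffix: string | null }): string {
        const joined = args.suffix ? args.prefix + ' ##CURSOR## ' + args.suffix : args.prefix;
        return joined.replace(/\s+/gm, ' ');
    }

    // A Map iterates in insertion order, so re-inserting a key on every hit keeps it "most recent".
    const cache = new Map<string, string | null>();

    export function getFromPromptCache(args: { prefix: string, suffix: string | null }): string | undefined | null {
        const key = cacheKey(args);
        if (!cache.has(key)) {
            return undefined; // miss: caller should run the completion
        }
        const value = cache.get(key) as string | null;
        cache.delete(key);
        cache.set(key, value); // refresh recency
        return value;
    }

    export function setPromptToCache(args: { prefix: string, suffix: string | null, value: string | null }) {
        const key = cacheKey(args);
        cache.delete(key);
        cache.set(key, args.value);
        if (cache.size > MAX_ENTRIES) {
            // Evict the least recently used entry (first in insertion order).
            const oldest = cache.keys().next().value as string;
            cache.delete(oldest);
        }
    }
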
diff --git a/src/prompts/provider.ts b/src/prompts/provider.ts
index ed4be76..4d9c655 100644
--- a/src/prompts/provider.ts
+++ b/src/prompts/provider.ts
@@ -3,10 +3,7 @@ import { info, warn } from '../modules/log';
 import { autocomplete } from './autocomplete';
 import { preparePrompt } from './preparePrompt';
 import { AsyncLock } from '../modules/lock';
-import { getFromPromptCache, setPromptToCache } from './promptCache';
 import { isNotNeeded, isSupported } from './filter';
-import { ollamaCheckModel } from '../modules/ollamaCheckModel';
-import { ollamaDownloadModel } from '../modules/ollamaDownloadModel';
 import { config } from '../config';
 
 type Status = {
@@ -105,85 +102,33 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                 // Result
                 let res: string | null = null;
 
-                // Check if in cache
-                let cached = getFromPromptCache({
-                    prefix: prepared.prefix,
-                    suffix: prepared.suffix
-                });
-
-                // If not cached
-                if (cached === undefined) {
-
-                    // Config
-                    let inferenceConfig = config.inference;
-
-                    // Update status
-                    this.update('sync~spin', 'Llama Coder');
-                    try {
-
-                        // Check model exists
-                        let modelExists = await ollamaCheckModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                        if (token.isCancellationRequested) {
-                            info(`Canceled after AI completion.`);
-                            return;
-                        }
-
-                        // Download model if not exists
-                        if (!modelExists) {
-
-                            // Check if user asked to ignore download
-                            if (this.context.globalState.get('llama-coder-download-ignored') === inferenceConfig.modelName) {
-                                info(`Ignoring since user asked to ignore download.`);
-                                return;
-                            }
-
-                            // Ask for download
-                            let download = await vscode.window.showInformationMessage(`Model ${inferenceConfig.modelName} is not downloaded. Do you want to download it? Answering "No" would require you to manually download model.`, 'Yes', 'No');
-                            if (download === 'No') {
-                                info(`Ignoring since user asked to ignore download.`);
-                                this.context.globalState.update('llama-coder-download-ignored', inferenceConfig.modelName);
-                                return;
-                            }
-
-                            // Perform download
-                            this.update('sync~spin', 'Downloading');
-                            await ollamaDownloadModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                            this.update('sync~spin', 'Llama Coder')
-                        }
-                        if (token.isCancellationRequested) {
-                            info(`Canceled after AI completion.`);
-                            return;
-                        }
-
-                        // Run AI completion
-                        info(`Running AI completion...`);
-                        res = await autocomplete({
-                            prefix: prepared.prefix,
-                            suffix: prepared.suffix,
-                            endpoint: inferenceConfig.endpoint,
-                            bearerToken: inferenceConfig.bearerToken,
-                            model: inferenceConfig.modelName,
-                            format: inferenceConfig.modelFormat,
-                            maxLines: inferenceConfig.maxLines,
-                            maxTokens: inferenceConfig.maxTokens,
-                            temperature: inferenceConfig.temperature,
-                            canceled: () => token.isCancellationRequested,
-                        });
-                        info(`AI completion completed: ${res}`);
-
-                        // Put to cache
-                        setPromptToCache({
-                            prefix: prepared.prefix,
-                            suffix: prepared.suffix,
-                            value: res
-                        });
-                    } finally {
-                        this.update('chip', 'Llama Coder');
-                    }
-                } else {
-                    if (cached !== null) {
-                        res = cached;
+                // Config
+                let inferenceConfig = config.inference;
+
+                // Update status
+                this.update('sync~spin', 'Llama Coder');
+                try {
+                    if (token.isCancellationRequested) {
+                        info(`Canceled before AI completion.`);
+                        return;
                     }
+
+                    // Run AI completion
+                    info(`Running AI completion...`);
+                    res = await autocomplete({
+                        prefix: prepared.prefix,
+                        suffix: prepared.suffix,
+                        endpoint: inferenceConfig.endpoint,
+                        bearerToken: inferenceConfig.bearerToken,
+                        model: inferenceConfig.modelName,
+                        maxLines: inferenceConfig.maxLines,
+                        maxTokens: inferenceConfig.maxTokens,
+                        temperature: inferenceConfig.temperature,
+                        canceled: () => token.isCancellationRequested,
+                    });
+                    info(`AI completion completed: ${res}`);
+                } finally {
+                    this.update('chip', 'Llama Coder');
                 }
                 if (token.isCancellationRequested) {
                     info(`Canceled after AI completion.`);