From b465ea2c710ae3da53d6929f925370fbb2402434 Mon Sep 17 00:00:00 2001
From: Yousshim
Date: Sun, 30 Mar 2025 17:45:37 +0200
Subject: [PATCH 1/6] feat: delegate the prompt formatting to ollama

---
 package.json                     | 48 --------------------------------
 src/config.ts                    | 13 ---------
 src/prompts/autocomplete.ts      |  8 ++----
 src/prompts/processors/models.ts | 34 ----------------------
 src/prompts/provider.ts          |  1 -
 5 files changed, 2 insertions(+), 102 deletions(-)
 delete mode 100644 src/prompts/processors/models.ts

diff --git a/package.json b/package.json
index ae247ee..1767626 100644
--- a/package.json
+++ b/package.json
@@ -83,37 +83,6 @@
         },
         "inference.model": {
           "type": "string",
-          "enum": [
-            "stable-code:3b-code-q4_0",
-            "codellama:7b-code-q4_K_S",
-            "codellama:7b-code-q4_K_M",
-            "codellama:7b-code-q6_K",
-            "codellama:7b-code-fp16",
-            "codellama:13b-code-q4_K_S",
-            "codellama:13b-code-q4_K_M",
-            "codellama:13b-code-q6_K",
-            "codellama:13b-code-fp16",
-            "codellama:34b-code-q4_K_S",
-            "codellama:34b-code-q4_K_M",
-            "codellama:34b-code-q6_K",
-            "codellama:70b-code-q4_K_S",
-            "codellama:70b-code-q4_K_M",
-            "codellama:70b-code-q6_K",
-            "codellama:70b-code-fp16",
-            "deepseek-coder:1.3b-base-q4_0",
-            "deepseek-coder:1.3b-base-q4_1",
-            "deepseek-coder:1.3b-base-q8_0",
-            "deepseek-coder:6.7b-base-q4_K_S",
-            "deepseek-coder:6.7b-base-q4_K_M",
-            "deepseek-coder:6.7b-base-q5_K_S",
-            "deepseek-coder:6.7b-base-q5_K_M",
-            "deepseek-coder:6.7b-base-q8_0",
-            "deepseek-coder:6.7b-base-fp16",
-            "deepseek-coder:33b-base-q4_K_S",
-            "deepseek-coder:33b-base-q4_K_M",
-            "deepseek-coder:33b-base-fp16",
-            "custom"
-          ],
           "default": "stable-code:3b-code-q4_0",
           "description": "Inference model to use",
           "order": 2
@@ -124,23 +93,6 @@
           "description": "Temperature of the model. Increasing the temperature will make the model answer more creatively.",
           "order": 3
         },
-        "inference.custom.model": {
-          "type": "string",
-          "default": "",
-          "description": "Custom model name",
-          "order": 4
-        },
-        "inference.custom.format": {
-          "type": "string",
-          "enum": [
-            "stable-code",
-            "codellama",
-            "deepseek"
-          ],
-          "default": "stable-code",
-          "description": "Custom model prompt format",
-          "order": 5
-        },
         "inference.maxLines": {
           "type": "number",
           "default": 16,
diff --git a/src/config.ts b/src/config.ts
index a6294b1..0cdafd8 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -1,5 +1,4 @@
 import vscode from 'vscode';
-import { ModelFormat } from './prompts/processors/models';
 
 class Config {
 
@@ -24,17 +23,6 @@ class Config {
 
         // Load model
         let modelName = config.get('model') as string;
-        let modelFormat: ModelFormat = 'codellama';
-        if (modelName === 'custom') {
-            modelName = config.get('custom.model') as string;
-            modelFormat = config.get('cutom.format') as ModelFormat;
-        } else {
-            if (modelName.startsWith('deepseek-coder')) {
-                modelFormat = 'deepseek';
-            } else if (modelName.startsWith('stable-code')) {
-                modelFormat = 'stable-code';
-            }
-        }
 
         let delay = config.get('delay') as number;
 
@@ -45,7 +33,6 @@ class Config {
             maxTokens,
             temperature,
             modelName,
-            modelFormat,
             delay
         };
     }
diff --git a/src/prompts/autocomplete.ts b/src/prompts/autocomplete.ts
index b6a8089..8925890 100644
--- a/src/prompts/autocomplete.ts
+++ b/src/prompts/autocomplete.ts
@@ -1,13 +1,11 @@
 import { ollamaTokenGenerator } from '../modules/ollamaTokenGenerator';
 import { countSymbol } from '../modules/text';
 import { info } from '../modules/log';
-import { ModelFormat, adaptPrompt } from './processors/models';
 
 export async function autocomplete(args: {
     endpoint: string,
     bearerToken: string,
     model: string,
-    format: ModelFormat,
     prefix: string,
     suffix: string,
     maxLines: number,
@@ -16,15 +14,13 @@ export async function autocomplete(args: {
     canceled?: () => boolean,
 }): Promise<string> {
 
-    let prompt = adaptPrompt({ prefix: args.prefix, suffix: args.suffix, format: args.format });
-
     // Calculate arguments
     let data = {
         model: args.model,
-        prompt: prompt.prompt,
+        prompt: args.prefix,
+        suffix: args.suffix,
         raw: true,
         options: {
-            stop: prompt.stop,
             num_predict: args.maxTokens,
             temperature: args.temperature
         }
diff --git a/src/prompts/processors/models.ts b/src/prompts/processors/models.ts
deleted file mode 100644
index 058905f..0000000
--- a/src/prompts/processors/models.ts
+++ /dev/null
@@ -1,34 +0,0 @@
-export type ModelFormat = 'codellama' | 'deepseek' | 'stable-code';
-
-export function adaptPrompt(args: { format: ModelFormat, prefix: string, suffix: string }): { prompt: string, stop: string[] } {
-
-    // Common non FIM mode
-    // if (!args.suffix) {
-    //     return {
-    //         prompt: args.prefix,
-    //         stop: [`<END>`]
-    //     };
-    // }
-
-    // Starcoder FIM
-    if (args.format === 'deepseek') {
-        return {
-            prompt: `<|fim▁begin|>${args.prefix}<|fim▁hole|>${args.suffix}<|fim▁end|>`,
-            stop: [`<|fim▁begin|>`, `<|fim▁hole|>`, `<|fim▁end|>`, `<END>`]
-        };
-    }
-
-    // Stable code FIM
-    if (args.format === 'stable-code') {
-        return {
-            prompt: `<fim_prefix>${args.prefix}<fim_suffix>${args.suffix}<fim_middle>`,
-            stop: [`<|endoftext|>`]
-        };
-    }
-
-    // Codellama FIM
-    return {
-        prompt: `<PRE> ${args.prefix} <SUF>${args.suffix} <MID>`,
-        stop: [`<END>`, `<EOD>`, `<EOT>`]
-    };
-}
\ No newline at end of file
diff --git a/src/prompts/provider.ts b/src/prompts/provider.ts
index ed4be76..5be15a3 100644
--- a/src/prompts/provider.ts
+++ b/src/prompts/provider.ts
@@ -163,7 +163,6 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                             endpoint: inferenceConfig.endpoint,
                             bearerToken: inferenceConfig.bearerToken,
                             model: inferenceConfig.modelName,
-                            format: inferenceConfig.modelFormat,
                             maxLines: inferenceConfig.maxLines,
                             maxTokens: inferenceConfig.maxTokens,
                             temperature: inferenceConfig.temperature,
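
Note: the extension no longer builds model-specific FIM strings; it sends the raw
prefix and suffix and lets Ollama apply the model's own template. A rough sketch of
the /api/generate payload autocomplete() now produces (field names taken from the
diff above; values are whatever the extension already computed):

    // sketch of the payload after this patch (args as in autocomplete())
    function buildPayload(args: { model: string, prefix: string, suffix: string, maxTokens: number, temperature: number }) {
        return {
            model: args.model,
            prompt: args.prefix,   // text before the cursor
            suffix: args.suffix,   // text after the cursor
            raw: true,
            options: {
                num_predict: args.maxTokens,
                temperature: args.temperature
            }
        };
    }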

From a9494f09fc32f83f474ef6e9be2076c8539a2ba4 Mon Sep 17 00:00:00 2001
From: Yousshim 
Date: Mon, 31 Mar 2025 00:13:40 +0200
Subject: [PATCH 2/6] refactor: add missing semicolon

---
 src/prompts/provider.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/prompts/provider.ts b/src/prompts/provider.ts
index 5be15a3..fc5b013 100644
--- a/src/prompts/provider.ts
+++ b/src/prompts/provider.ts
@@ -148,7 +148,7 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                             // Perform download
                             this.update('sync~spin', 'Downloading');
                             await ollamaDownloadModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                            this.update('sync~spin', 'Llama Coder')
+                            this.update('sync~spin', 'Llama Coder');
                         }
                         if (token.isCancellationRequested) {
                             info(`Canceled after AI completion.`);

From 99b2bd9bbeb1b6358f51b117cd88abb73e093b13 Mon Sep 17 00:00:00 2001
From: Yousshim 
Date: Tue, 1 Apr 2025 15:38:33 +0200
Subject: [PATCH 3/6] Remove prompt cache

---
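Note: with the prompt cache gone, every accepted prefix/suffix pair results in a
fresh request to Ollama; repeated completions at the same position are no longer
served from memory.
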
 src/prompts/promptCache.ts | 28 ------------
 src/prompts/provider.ts    | 90 ++++++++++++++------------------------
 2 files changed, 34 insertions(+), 84 deletions(-)
 delete mode 100644 src/prompts/promptCache.ts

diff --git a/src/prompts/promptCache.ts b/src/prompts/promptCache.ts
deleted file mode 100644
index d4b4d73..0000000
--- a/src/prompts/promptCache.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// Remove all newlines, double spaces, etc
-function normalizeText(src: string) {
-    src = src.split('\n').join(' ');
-    src = src.replace(/\s+/gm, ' ');
-    return src;
-}
-
-function extractPromptCacheKey(args: { prefix: string, suffix: string | null }) {
-    if (args.suffix) {
-        return normalizeText(args.prefix + ' ##CURSOR## ' + args.suffix);
-    } else {
-        return normalizeText(args.prefix);
-    }
-}
-
-// TODO: make it LRU
-let cache: { [key: string]: string | null } = {};
-
-export function getFromPromptCache(args: { prefix: string, suffix: string | null }): string | undefined | null {
-    const key = extractPromptCacheKey(args);
-    return cache[key];
-}
-
-export function setPromptToCache(args: { prefix: string, suffix: string | null, value: string | null }) {
-    const key = extractPromptCacheKey(args);
-    cache[key] = args.value;
-}
\ No newline at end of file
diff --git a/src/prompts/provider.ts b/src/prompts/provider.ts
index fc5b013..304db34 100644
--- a/src/prompts/provider.ts
+++ b/src/prompts/provider.ts
@@ -3,7 +3,6 @@ import { info, warn } from '../modules/log';
 import { autocomplete } from './autocomplete';
 import { preparePrompt } from './preparePrompt';
 import { AsyncLock } from '../modules/lock';
-import { getFromPromptCache, setPromptToCache } from './promptCache';
 import { isNotNeeded, isSupported } from './filter';
 import { ollamaCheckModel } from '../modules/ollamaCheckModel';
 import { ollamaDownloadModel } from '../modules/ollamaDownloadModel';
@@ -105,15 +104,6 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                 // Result
                 let res: string | null = null;
 
-                // Check if in cache
-                let cached = getFromPromptCache({
-                    prefix: prepared.prefix,
-                    suffix: prepared.suffix
-                });
-
-                // If not cached
-                if (cached === undefined) {
-
                     // Config
                     let inferenceConfig = config.inference;
 
@@ -131,59 +121,47 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                         // Download model if not exists
                         if (!modelExists) {
 
-                            // Check if user asked to ignore download
-                            if (this.context.globalState.get('llama-coder-download-ignored') === inferenceConfig.modelName) {
-                                info(`Ingoring since user asked to ignore download.`);
-                                return;
-                            }
-
-                            // Ask for download
-                            let download = await vscode.window.showInformationMessage(`Model ${inferenceConfig.modelName} is not downloaded. Do you want to download it? Answering "No" would require you to manually download model.`, 'Yes', 'No');
-                            if (download === 'No') {
-                                info(`Ingoring since user asked to ignore download.`);
-                                this.context.globalState.update('llama-coder-download-ignored', inferenceConfig.modelName);
-                                return;
-                            }
-
-                            // Perform download
-                            this.update('sync~spin', 'Downloading');
-                            await ollamaDownloadModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                            this.update('sync~spin', 'Llama Coder');
+                        // Check if user asked to ignore download
+                        if (this.context.globalState.get('llama-coder-download-ignored') === inferenceConfig.modelName) {
+                            info(`Ingoring since user asked to ignore download.`);
+                            return;
                         }
-                        if (token.isCancellationRequested) {
-                            info(`Canceled after AI completion.`);
+
+                        // Ask for download
+                        let download = await vscode.window.showInformationMessage(`Model ${inferenceConfig.modelName} is not downloaded. Do you want to download it? Answering "No" would require you to manually download model.`, 'Yes', 'No');
+                        if (download === 'No') {
+                            info(`Ingoring since user asked to ignore download.`);
+                            this.context.globalState.update('llama-coder-download-ignored', inferenceConfig.modelName);
                             return;
                         }
 
-                        // Run AI completion
-                        info(`Running AI completion...`);
-                        res = await autocomplete({
-                            prefix: prepared.prefix,
-                            suffix: prepared.suffix,
-                            endpoint: inferenceConfig.endpoint,
-                            bearerToken: inferenceConfig.bearerToken,
-                            model: inferenceConfig.modelName,
-                            maxLines: inferenceConfig.maxLines,
-                            maxTokens: inferenceConfig.maxTokens,
-                            temperature: inferenceConfig.temperature,
-                            canceled: () => token.isCancellationRequested,
-                        });
-                        info(`AI completion completed: ${res}`);
-
-                        // Put to cache
-                        setPromptToCache({
-                            prefix: prepared.prefix,
-                            suffix: prepared.suffix,
-                            value: res
-                        });
+                        // Perform download
+                        this.update('sync~spin', 'Downloading');
+                        await ollamaDownloadModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
+                        this.update('sync~spin', 'Llama Coder');
+                    }
+                    if (token.isCancellationRequested) {
+                        info(`Canceled after AI completion.`);
+                        return;
+                    }
+
+                    // Run AI completion
+                    info(`Running AI completion...`);
+                    res = await autocomplete({
+                        prefix: prepared.prefix,
+                        suffix: prepared.suffix,
+                        endpoint: inferenceConfig.endpoint,
+                        bearerToken: inferenceConfig.bearerToken,
+                        model: inferenceConfig.modelName,
+                        maxLines: inferenceConfig.maxLines,
+                        maxTokens: inferenceConfig.maxTokens,
+                        temperature: inferenceConfig.temperature,
+                        canceled: () => token.isCancellationRequested,
+                    });
+                    info(`AI completion completed: ${res}`);
                     } finally {
                         this.update('chip', 'Llama Coder');
                     }
-                } else {
-                    if (cached !== null) {
-                        res = cached;
-                    }
-                }
                 if (token.isCancellationRequested) {
                     info(`Canceled after AI completion.`);
                     return;

From 005e08c5a586f420e4de06d9d68f403289716f06 Mon Sep 17 00:00:00 2001
From: Yousshim 
Date: Tue, 1 Apr 2025 15:44:01 +0200
Subject: [PATCH 4/6] remove auto model downloads

---
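Note: the extension no longer checks for or downloads missing models. The model
configured in `inference.model` has to be pulled on the Ollama host beforehand,
for example with `ollama pull stable-code:3b-code-q4_0`.
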
 src/modules/ollamaCheckModel.ts    | 21 -------------
 src/modules/ollamaDownloadModel.ts |  9 ------
 src/prompts/provider.ts            | 50 ++++++------------------------
 3 files changed, 9 insertions(+), 71 deletions(-)
 delete mode 100644 src/modules/ollamaCheckModel.ts
 delete mode 100644 src/modules/ollamaDownloadModel.ts

diff --git a/src/modules/ollamaCheckModel.ts b/src/modules/ollamaCheckModel.ts
deleted file mode 100644
index 5aedb8f..0000000
--- a/src/modules/ollamaCheckModel.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-import { info } from "./log";
-
-export async function ollamaCheckModel(endpoint: string, model: string, bearerToken: string) {
-    // Check if exists
-    let res = await fetch(endpoint + '/api/tags', {
-      headers: bearerToken ? {
-            Authorization: `Bearer ${bearerToken}`,
-          } : {},
-    });
-    if (!res.ok) {
-        info(await res.text());
-        info(endpoint + '/api/tags');
-        throw Error('Network response was not ok.');
-    }
-    let body = await res.json() as { models: { name: string }[] };
-    if (body.models.find((v) => v.name === model)) {
-        return true;
-    } else {
-        return false;
-    }
-}
\ No newline at end of file
diff --git a/src/modules/ollamaDownloadModel.ts b/src/modules/ollamaDownloadModel.ts
deleted file mode 100644
index 7e6eccb..0000000
--- a/src/modules/ollamaDownloadModel.ts
+++ /dev/null
@@ -1,9 +0,0 @@
-import { lineGenerator } from "./lineGenerator";
-import { info } from "./log";
-
-export async function ollamaDownloadModel(endpoint: string, model: string, bearerToken: string) {
-    info('Downloading model from ollama: ' + model);
-    for await (let line of lineGenerator(endpoint + '/api/pull', { name: model }, bearerToken)) {
-        info('[DOWNLOAD] ' + line);
-    }
-}
\ No newline at end of file
diff --git a/src/prompts/provider.ts b/src/prompts/provider.ts
index 304db34..4d9c655 100644
--- a/src/prompts/provider.ts
+++ b/src/prompts/provider.ts
@@ -4,8 +4,6 @@ import { autocomplete } from './autocomplete';
 import { preparePrompt } from './preparePrompt';
 import { AsyncLock } from '../modules/lock';
 import { isNotNeeded, isSupported } from './filter';
-import { ollamaCheckModel } from '../modules/ollamaCheckModel';
-import { ollamaDownloadModel } from '../modules/ollamaDownloadModel';
 import { config } from '../config';
 
 type Status = {
@@ -104,42 +102,12 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                 // Result
                 let res: string | null = null;
 
-                    // Config
-                    let inferenceConfig = config.inference;
-
-                    // Update status
-                    this.update('sync~spin', 'Llama Coder');
-                    try {
-
-                        // Check model exists
-                        let modelExists = await ollamaCheckModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                        if (token.isCancellationRequested) {
-                            info(`Canceled after AI completion.`);
-                            return;
-                        }
-
-                        // Download model if not exists
-                        if (!modelExists) {
-
-                        // Check if user asked to ignore download
-                        if (this.context.globalState.get('llama-coder-download-ignored') === inferenceConfig.modelName) {
-                            info(`Ingoring since user asked to ignore download.`);
-                            return;
-                        }
-
-                        // Ask for download
-                        let download = await vscode.window.showInformationMessage(`Model ${inferenceConfig.modelName} is not downloaded. Do you want to download it? Answering "No" would require you to manually download model.`, 'Yes', 'No');
-                        if (download === 'No') {
-                            info(`Ingoring since user asked to ignore download.`);
-                            this.context.globalState.update('llama-coder-download-ignored', inferenceConfig.modelName);
-                            return;
-                        }
-
-                        // Perform download
-                        this.update('sync~spin', 'Downloading');
-                        await ollamaDownloadModel(inferenceConfig.endpoint, inferenceConfig.modelName, inferenceConfig.bearerToken);
-                        this.update('sync~spin', 'Llama Coder');
-                    }
+                // Config
+                let inferenceConfig = config.inference;
+
+                // Update status
+                this.update('sync~spin', 'Llama Coder');
+                try {
                     if (token.isCancellationRequested) {
                         info(`Canceled after AI completion.`);
                         return;
@@ -159,9 +127,9 @@ export class PromptProvider implements vscode.InlineCompletionItemProvider {
                         canceled: () => token.isCancellationRequested,
                     });
                     info(`AI completion completed: ${res}`);
-                    } finally {
-                        this.update('chip', 'Llama Coder');
-                    }
+                } finally {
+                    this.update('chip', 'Llama Coder');
+                }
                 if (token.isCancellationRequested) {
                     info(`Canceled after AI completion.`);
                     return;

From 1986d5cbfe6b2e7ad7b1d93b7e45ddbdd3515a02 Mon Sep 17 00:00:00 2001
From: Yousshim 
Date: Tue, 1 Apr 2025 16:03:56 +0200
Subject: [PATCH 5/6] Remove block processing

---
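Note: without the bracket stack, completions are truncated purely by line count
(maxLines); the generated text is no longer required to close the brackets it
opened before the limit is applied.
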
 src/prompts/autocomplete.ts | 56 ++-----------------------------------
 1 file changed, 3 insertions(+), 53 deletions(-)

diff --git a/src/prompts/autocomplete.ts b/src/prompts/autocomplete.ts
index 8925890..94babe8 100644
--- a/src/prompts/autocomplete.ts
+++ b/src/prompts/autocomplete.ts
@@ -29,71 +29,21 @@ export async function autocomplete(args: {
     // Receiving tokens
     let res = '';
     let totalLines = 1;
-    let blockStack: ('[' | '(' | '{')[] = [];
-    outer: for await (let tokens of ollamaTokenGenerator(args.endpoint + '/api/generate', data, args.bearerToken)) {
+    for await (let tokens of ollamaTokenGenerator(args.endpoint + '/api/generate', data, args.bearerToken)) {
         if (args.canceled && args.canceled()) {
             break;
         }
 
-        // Block stack
-        for (let c of tokens.response) {
-
-            // Open block
-            if (c === '[') {
-                blockStack.push('[');
-            } else if (c === '(') {
-                blockStack.push('(');
-            }
-            if (c === '{') {
-                blockStack.push('{');
-            }
-
-            // Close block
-            if (c === ']') {
-                if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '[') {
-                    blockStack.pop();
-                } else {
-                    info('Block stack error, breaking.');
-                    break outer;
-                }
-            }
-            if (c === ')') {
-                if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '(') {
-                    blockStack.pop();
-                } else {
-                    info('Block stack error, breaking.');
-                    break outer;
-                }
-            }
-            if (c === '}') {
-                if (blockStack.length > 0 && blockStack[blockStack.length - 1] === '{') {
-                    blockStack.pop();
-                } else {
-                    info('Block stack error, breaking.');
-                    break outer;
-                }
-            }
-
-            // Append charater
-            res += c;
-        }
+        res = res + tokens.response;
 
         // Update total lines
         totalLines += countSymbol(tokens.response, '\n');
         // Break if too many lines and on top level
-        if (totalLines > args.maxLines && blockStack.length === 0) {
+        if (totalLines > args.maxLines) {
             info('Too many lines, breaking.');
             break;
         }
     }
 
-    // Remove <EOT>
-    if (res.endsWith('<EOT>')) {
-        res = res.slice(0, res.length - 5);
-    }
-
-    // Trim ends of all lines since sometimes the AI completion will add extra spaces
-    res = res.split('\n').map((v) => v.trimEnd()).join('\n');
-
     return res;
 }
\ No newline at end of file

From d1d86cb0c50db78c8840546e7c1b040fb9ab812a Mon Sep 17 00:00:00 2001
From: Yousshim 
Date: Tue, 1 Apr 2025 16:48:05 +0200
Subject: [PATCH 6/6] Use ollama without streaming

---
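Note: with stream: false, /api/generate answers with a single JSON object instead
of a line-delimited stream, which is why lineGenerator and ollamaTokenGenerator can
be dropped. The reply looks roughly like this (trimmed to the fields the new code
reads; the real payload carries additional metadata):

    // approximate non-streaming reply from /api/generate
    // {
    //   "model": "stable-code:3b-code-q4_0",
    //   "response": "...generated completion...",
    //   "done": true
    // }
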
 src/modules/lineGenerator.ts        | 54 -----------------------------
 src/modules/ollamaRequest.ts        | 35 +++++++++++++++++++
 src/modules/ollamaTokenGenerator.ts | 22 ------------
 src/prompts/autocomplete.ts         | 40 ++++++++++-----------
 4 files changed, 55 insertions(+), 96 deletions(-)
 delete mode 100644 src/modules/lineGenerator.ts
 create mode 100644 src/modules/ollamaRequest.ts
 delete mode 100644 src/modules/ollamaTokenGenerator.ts

diff --git a/src/modules/lineGenerator.ts b/src/modules/lineGenerator.ts
deleted file mode 100644
index f20081f..0000000
--- a/src/modules/lineGenerator.ts
+++ /dev/null
@@ -1,54 +0,0 @@
-export async function* lineGenerator(url: string, data: any, bearerToken: string): AsyncGenerator<string> {
-    // Request
-    const controller = new AbortController();
-    let res = await fetch(url, {
-      method: 'POST',
-      body: JSON.stringify(data),
-      headers: bearerToken ? {
-            'Content-Type': 'application/json',
-            Authorization: `Bearer ${bearerToken}`,
-          } : {
-            'Content-Type': 'application/json',
-          },
-      signal: controller.signal,
-    });
-    if (!res.ok || !res.body) {
-        throw Error('Unable to connect to backend');
-    }
-
-    // Reading stream
-    let stream = res.body.getReader();
-    const decoder = new TextDecoder();
-    let pending: string = '';
-    try {
-        while (true) {
-            const { done, value } = await stream.read();
-
-            // If ended
-            if (done) {
-                if (pending.length > 0) { // New lines are impossible here
-                    yield pending;
-                }
-                break;
-            }
-
-            // Append chunk
-            let chunk = decoder.decode(value);
-            console.warn(chunk);
-            pending += chunk;
-
-            // Yield results 
-            while (pending.indexOf('\n') >= 0) {
-                let offset = pending.indexOf('\n');
-                yield pending.slice(0, offset);
-                pending = pending.slice(offset + 1);
-            }
-        }
-    } finally {
-        stream.releaseLock();
-        if (!stream.closed) { // Stop generation
-            await stream.cancel();
-        }
-        controller.abort();
-    }
-}
\ No newline at end of file
diff --git a/src/modules/ollamaRequest.ts b/src/modules/ollamaRequest.ts
new file mode 100644
index 0000000..cbf8d93
--- /dev/null
+++ b/src/modules/ollamaRequest.ts
@@ -0,0 +1,35 @@
+export async function makeOllamaRequest(url: string, data: any, bearerToken: string): Promise<string> {
+    // Request
+    const controller = new AbortController();
+    let res = await fetch(url, {
+      method: 'POST',
+      body: JSON.stringify(data),
+      headers: bearerToken ? {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${bearerToken}`,
+          } : {
+            'Content-Type': 'application/json',
+          },
+      signal: controller.signal,
+    });
+    if (!res.ok || !res.body) {
+        throw Error('Unable to connect to backend');
+    }
+
+    // Reading stream
+    let stream = res.body.getReader();
+    const decoder = new TextDecoder();
+    try {
+        const { value } = await stream.read();
+
+        // Append chunk
+        let chunk = decoder.decode(value);
+        return chunk;
+    } finally {
+        stream.releaseLock();
+        if (!stream.closed) { // Stop generation
+            await stream.cancel();
+        }
+        controller.abort();
+    }
+}
\ No newline at end of file
diff --git a/src/modules/ollamaTokenGenerator.ts b/src/modules/ollamaTokenGenerator.ts
deleted file mode 100644
index f57fef9..0000000
--- a/src/modules/ollamaTokenGenerator.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import { lineGenerator } from "./lineGenerator";
-import { info } from "./log";
-
-export type OllamaToken = {
-    model: string,
-    response: string,
-    done: boolean
-};
-
-export async function* ollamaTokenGenerator(url: string, data: any, bearerToken: string): AsyncGenerator<OllamaToken> {
-    for await (let line of lineGenerator(url, data, bearerToken)) {
-        info('Receive line: ' + line);
-        let parsed: OllamaToken;
-        try {
-            parsed = JSON.parse(line) as OllamaToken;
-        } catch (e) { 
-            console.warn('Receive wrong line: ' + line);
-            continue;
-        }
-        yield parsed;
-    }
-}
\ No newline at end of file
diff --git a/src/prompts/autocomplete.ts b/src/prompts/autocomplete.ts
index 94babe8..3415d17 100644
--- a/src/prompts/autocomplete.ts
+++ b/src/prompts/autocomplete.ts
@@ -1,6 +1,9 @@
-import { ollamaTokenGenerator } from '../modules/ollamaTokenGenerator';
-import { countSymbol } from '../modules/text';
-import { info } from '../modules/log';
+import { makeOllamaRequest } from "../modules/ollamaRequest";
+
+type OllamaToken = {
+    model: string,
+    response: string,
+};
 
 export async function autocomplete(args: {
     endpoint: string,
@@ -20,30 +23,27 @@ export async function autocomplete(args: {
         prompt: args.prefix,
         suffix: args.suffix,
         raw: true,
+        stream: false,
         options: {
             num_predict: args.maxTokens,
             temperature: args.temperature
         }
     };
 
-    // Receiving tokens
-    let res = '';
-    let totalLines = 1;
-    for await (let tokens of ollamaTokenGenerator(args.endpoint + '/api/generate', data, args.bearerToken)) {
+    const res = await makeOllamaRequest(args.endpoint + '/api/generate', data, args.bearerToken);
+    try {
+        const tokens =  JSON.parse(res) as OllamaToken;
         if (args.canceled && args.canceled()) {
-            break;
-        }
-
-        res = res + tokens.response;
-
-        // Update total lines
-        totalLines += countSymbol(tokens.response, '\n');
-        // Break if too many lines and on top level
-        if (totalLines > args.maxLines) {
-            info('Too many lines, breaking.');
-            break;
+            return "";
         }
+        const response = tokens.response;
+        
+        // take only the first args.maxLines lines of the response
+        let lines = response.split('\n');
+        lines = lines.slice(0, args.maxLines);
+        return lines.join('\n');
+    } catch (e) { 
+        console.warn('Receive wrong line: ' + res);
+        return "";
     }
-
-    return res;
 }
\ No newline at end of file