Commit a92ef97

refactor: move llama.cpp config handling to Rust
- Removed duplicated TypeScript type definitions for LlamacppConfig, ModelPlan, DownloadItem, ModelConfig, etc.
- Added a new `src/guest-js/types.ts` that exports the consolidated types and a helper `normalizeLlamacppConfig` for converting raw config objects (a rough sketch of this surface follows below).
- Implemented a dedicated Rust module `args.rs` that builds all command-line arguments for llama.cpp from a `LlamacppConfig` struct, handling embedding, flash-attention, GPU/CPU flags, and other options.
- Updated `commands.rs` to construct arguments via `ArgumentBuilder`, validate paths, and log the generated args.
- Added more explicit error handling for invalid configuration arguments and updated the error enum to include `InvalidArgument`.
- Exported the new `cleanupLlamaProcesses` command and updated the guest-JS API accordingly.
- Adjusted the TypeScript `loadLlamaModel` helper to use the new config normalization and argument shape.
- Improved logging and documentation for clarity.
1 parent 053158b commit a92ef97

7 files changed: +686 additions, -237 deletions
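A rough sketch, for orientation, of the consolidated guest-JS surface described above: the field names follow the `LlamacppConfig` type removed from index.ts below, while the exact contents of `src/guest-js/types.ts` and the signature and defaults of `normalizeLlamacppConfig` are assumptions rather than code taken from this commit.

    // Sketch only: abbreviated LlamacppConfig; the full field list matches the
    // `type LlamacppConfig = { ... }` block removed from index.ts in this commit.
    export type LlamacppConfig = {
      version_backend: string
      auto_update_engine: boolean
      auto_unload: boolean
      timeout: number
      ctx_size: number
      n_gpu_layers: number
      flash_attn: string
      // ...remaining fields (threads, batch_size, rope_*, cache_type_*, etc.)
      [key: string]: unknown
    }

    // Hypothetical normalizer: coerce a raw settings object (numbers stored as
    // strings, missing keys) into a well-typed LlamacppConfig. The defaults here
    // are illustrative only.
    export function normalizeLlamacppConfig(
      raw: Record<string, unknown>
    ): LlamacppConfig {
      return {
        ...raw,
        timeout: Number(raw.timeout ?? 600),
        ctx_size: Number(raw.ctx_size ?? 0),
        n_gpu_layers: Number(raw.n_gpu_layers ?? 100),
      } as LlamacppConfig
    }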

extensions/llamacpp-extension/src/index.ts

Lines changed: 33 additions & 193 deletions
@@ -44,106 +44,19 @@ import {
   isModelSupported,
   planModelLoadInternal,
   unloadLlamaModel,
+  LlamacppConfig,
+  ModelPlan,
+  DownloadItem,
+  ModelConfig,
+  EmbeddingResponse,
+  DeviceList,
+  SystemMemory,
 } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'
 
 // Error message constant - matches web-app/src/utils/error.ts
 const OUT_OF_CONTEXT_SIZE = 'the request exceeds the available context size.'
 
-type LlamacppConfig = {
-  version_backend: string
-  auto_update_engine: boolean
-  auto_unload: boolean
-  timeout: number
-  llamacpp_env: string
-  memory_util: string
-  chat_template: string
-  n_gpu_layers: number
-  offload_mmproj: boolean
-  cpu_moe: boolean
-  n_cpu_moe: number
-  override_tensor_buffer_t: string
-  ctx_size: number
-  threads: number
-  threads_batch: number
-  n_predict: number
-  batch_size: number
-  ubatch_size: number
-  device: string
-  split_mode: string
-  main_gpu: number
-  flash_attn: string
-  cont_batching: boolean
-  no_mmap: boolean
-  mlock: boolean
-  no_kv_offload: boolean
-  cache_type_k: string
-  cache_type_v: string
-  defrag_thold: number
-  rope_scaling: string
-  rope_scale: number
-  rope_freq_base: number
-  rope_freq_scale: number
-  ctx_shift: boolean
-}
-
-type ModelPlan = {
-  gpuLayers: number
-  maxContextLength: number
-  noOffloadKVCache: boolean
-  offloadMmproj?: boolean
-  batchSize: number
-  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
-}
-
-interface DownloadItem {
-  url: string
-  save_path: string
-  proxy?: Record<string, string | string[] | boolean>
-  sha256?: string
-  size?: number
-}
-
-interface ModelConfig {
-  model_path: string
-  mmproj_path?: string
-  name: string // user-friendly
-  // some model info that we cache upon import
-  size_bytes: number
-  sha256?: string
-  mmproj_sha256?: string
-  mmproj_size_bytes?: number
-}
-
-interface EmbeddingResponse {
-  model: string
-  object: string
-  usage: {
-    prompt_tokens: number
-    total_tokens: number
-  }
-  data: EmbeddingData[]
-}
-
-interface EmbeddingData {
-  embedding: number[]
-  index: number
-  object: string
-}
-
-interface DeviceList {
-  id: string
-  name: string
-  mem: number
-  free: number
-}
-
-interface SystemMemory {
-  totalVRAM: number
-  totalRAM: number
-  totalMemory: number
-}
-
 /**
  * Override the default app.log function to use Jan's logging system.
  * @param args
@@ -556,6 +469,7 @@ export default class llamacpp_extension extends AIEngine {
     // Vulkan will be conditionally prioritized based on GPU memory
     const backendPriorities: string[] = hasEnoughGpuMemory
       ? [
+          'cuda-cu13.0',
          'cuda-cu12.0',
          'cuda-cu11.7',
          'vulkan',
@@ -568,6 +482,7 @@ export default class llamacpp_extension extends AIEngine {
          'x64',
        ]
      : [
+          'cuda-cu13.0',
          'cuda-cu12.0',
          'cuda-cu11.7',
          'common_cpus', // NEW: Unified CPU backend
@@ -582,6 +497,8 @@ export default class llamacpp_extension extends AIEngine {
 
     // Helper to map backend string to a priority category
     const getBackendCategory = (backendString: string): string | undefined => {
+      if (backendString.includes('cuda-13-common_cpus')) return 'cuda-cu13.0'
+
       if (
         backendString.includes('cuda-12-common_cpus') ||
         backendString.includes('cu12.0')
@@ -838,7 +755,7 @@ export default class llamacpp_extension extends AIEngine {
       return { updateNeeded: false, newVersion: '0' }
     }
 
-    const [latestVersion, latestBackend] = targetBackendString.split('/')
+    const [latestVersion] = targetBackendString.split('/')
 
     // Check if update is needed (version comparison)
     if (
@@ -1210,7 +1127,8 @@ export default class llamacpp_extension extends AIEngine {
     // - k_llama-main-b4314-09c61e1-bin-win-cuda-12.8-x64-avx2.zip
     // - ik_llama-main-b4314-09c61e1-cudart-llama-bin-win-cuda-12.8-x64-avx512.zip
     // - llama-b7037-bin-win-cuda-12.4-x64.zip (legacy format)
-    const re = /^(.+?[-_])?llama(?:-main)?-(b\d+(?:-[a-f0-9]+)?)(?:-cudart-llama)?-bin-(.+?)\.(?:tar\.gz|zip)$/
+    const re =
+      /^(.+?[-_])?llama(?:-main)?-(b\d+(?:-[a-f0-9]+)?)(?:-cudart-llama)?-bin-(.+?)\.(?:tar\.gz|zip)$/
 
     const archiveName = await basename(path)
     logger.info(`Installing backend from path: ${path}`)
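The change above only re-wraps the archive-name regex for line length. As a quick illustration of what it accepts, here is a small standalone TypeScript sketch (not from the commit) that runs the same pattern against the sample archive names quoted in the comments:

    // Standalone check of the backend-archive filename pattern from index.ts.
    const re =
      /^(.+?[-_])?llama(?:-main)?-(b\d+(?:-[a-f0-9]+)?)(?:-cudart-llama)?-bin-(.+?)\.(?:tar\.gz|zip)$/

    const samples = [
      'k_llama-main-b4314-09c61e1-bin-win-cuda-12.8-x64-avx2.zip',
      'ik_llama-main-b4314-09c61e1-cudart-llama-bin-win-cuda-12.8-x64-avx512.zip',
      'llama-b7037-bin-win-cuda-12.4-x64.zip', // legacy format
    ]

    for (const name of samples) {
      const m = name.match(re)
      if (m) {
        // m[1]: optional fork prefix ('k_', 'ik_', or undefined)
        // m[2]: build tag (e.g. 'b4314-09c61e1'), m[3]: platform/variant string
        console.log(m[1] ?? '(none)', m[2], m[3])
      } else {
        console.log(`no match: ${name}`)
      }
    }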
@@ -1667,7 +1585,7 @@ export default class llamacpp_extension extends AIEngine {
           return await this.findSessionByModel(modelId)
         } catch (e) {
           logger.warn(`Unable to find session for model "${modelId}": ${e}`)
-          return null // treat as “not‑eligible for unload”
+          return null
         }
       })
     )
@@ -1685,10 +1603,11 @@ export default class llamacpp_extension extends AIEngine {
        }
      }
    }
-    const args: string[] = []
+
    const envs: Record<string, string> = {}
    const cfg = { ...this.config, ...(overrideSettings ?? {}) }
    const [version, backend] = cfg.version_backend.split('/')
+
    if (!version || !backend) {
      throw new Error(
        'Initial setup for the backend failed due to a network issue. Please restart the app!'
@@ -1710,120 +1629,41 @@ export default class llamacpp_extension extends AIEngine {
     })
     const port = await this.getRandomPort()
 
-    // disable llama-server webui
-    // TODO: Determine what's the best course of action here.
-    // Hopefully, we would want all the fork to have same set of arguments
-    // Otherwise it would become impossible to maintain
-    // Keeping this for now
-    if (!backend.startsWith('ik')) args.push('--no-webui')
+    // Generate API key
     const api_key = await this.generateApiKey(modelId, String(port))
     envs['LLAMA_API_KEY'] = api_key
     envs['LLAMA_ARG_TIMEOUT'] = String(this.timeout)
 
-    // set user envs
+    // Set user envs
     if (this.llamacpp_env) this.parseEnvFromString(envs, this.llamacpp_env)
 
-    // model option is required
-    // NOTE: model_path and mmproj_path can be either relative to Jan's data folder or absolute path
+    // Resolve model path
     const modelPath = await joinPath([
       janDataFolderPath,
       modelConfig.model_path,
     ])
-    args.push('--jinja')
-    args.push('-m', modelPath)
-    if (cfg.cpu_moe) args.push('--cpu-moe')
-    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
-      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
-    }
-    // For overriding tensor buffer type, useful where
-    // massive MOE models can be made faster by keeping attention on the GPU
-    // and offloading the expert FFNs to the CPU.
-    // This is an expert level settings and should only be used by people
-    // who knows what they are doing.
-    // Takes a regex with matching tensor name as input
-    if (cfg.override_tensor_buffer_t)
-      args.push('--override-tensor', cfg.override_tensor_buffer_t)
-    // offload multimodal projector model to the GPU by default. if there is not enough memory
-    // turn this setting off will keep the projector model on the CPU but the image processing can
-    // take longer
-    if (cfg.offload_mmproj === false) args.push('--no-mmproj-offload')
-    args.push('-a', modelId)
-    args.push('--port', String(port))
-    if (modelConfig.mmproj_path) {
-      const mmprojPath = await joinPath([
-        janDataFolderPath,
-        modelConfig.mmproj_path,
-      ])
-      args.push('--mmproj', mmprojPath)
-    }
-    // Add remaining options from the interface
-    if (cfg.chat_template) args.push('--chat-template', cfg.chat_template)
-    const gpu_layers =
-      parseInt(String(cfg.n_gpu_layers)) >= 0 ? cfg.n_gpu_layers : 100
-    args.push('-ngl', String(gpu_layers))
-    if (cfg.threads > 0) args.push('--threads', String(cfg.threads))
-    if (cfg.threads_batch > 0)
-      args.push('--threads-batch', String(cfg.threads_batch))
-    if (cfg.batch_size > 0) args.push('--batch-size', String(cfg.batch_size))
-    if (cfg.ubatch_size > 0) args.push('--ubatch-size', String(cfg.ubatch_size))
-    if (cfg.device.length > 0) args.push('--device', cfg.device)
-    if (cfg.split_mode.length > 0 && cfg.split_mode != 'layer')
-      args.push('--split-mode', cfg.split_mode)
-    if (cfg.main_gpu !== undefined && cfg.main_gpu !== 0)
-      args.push('--main-gpu', String(cfg.main_gpu))
-    // Note: Older llama.cpp versions are no longer supported
-    if (
-      cfg.flash_attn !== undefined ||
-      (cfg.flash_attn !== 'auto' && // set argument only when the setting value is not auto
-        !backend.startsWith('ik')) // ik fork of llama.cpp doesn't support --flash-attn
-    ) {
-      args.push('--flash-attn', String(cfg.flash_attn)) //default: auto = ON when supported
-    } else if (backend.startsWith('ik') && cfg.flash_attn == 'on') {
-      args.push('-fa') // hoping the ik fork is still using the old fa arguments
-    }
-
-    // Boolean flags
-    if (cfg.ctx_shift) args.push('--context-shift')
-    if (cfg.cont_batching) args.push('--cont-batching')
-    if (cfg.no_mmap) args.push('--no-mmap')
-    if (cfg.mlock) args.push('--mlock')
-    if (cfg.no_kv_offload) args.push('--no-kv-offload')
-    if (isEmbedding) {
-      args.push('--embedding')
-      args.push('--pooling', 'mean')
-    } else {
-      if (cfg.ctx_size > 0) args.push('--ctx-size', String(cfg.ctx_size))
-      if (cfg.n_predict > 0) args.push('--n-predict', String(cfg.n_predict))
-      if (cfg.cache_type_k && cfg.cache_type_k != 'f16')
-        args.push('--cache-type-k', cfg.cache_type_k)
-      if (
-        cfg.flash_attn !== 'on' &&
-        cfg.cache_type_v != 'f16' &&
-        cfg.cache_type_v != 'f32'
-      ) {
-        args.push('--cache-type-v', cfg.cache_type_v)
-      }
-      if (cfg.defrag_thold && cfg.defrag_thold != 0.1)
-        args.push('--defrag-thold', String(cfg.defrag_thold))
 
-      if (cfg.rope_scaling && cfg.rope_scaling != 'none')
-        args.push('--rope-scaling', cfg.rope_scaling)
-      if (cfg.rope_scale && cfg.rope_scale != 1)
-        args.push('--rope-scale', String(cfg.rope_scale))
-      if (cfg.rope_freq_base && cfg.rope_freq_base != 0)
-        args.push('--rope-freq-base', String(cfg.rope_freq_base))
-      if (cfg.rope_freq_scale && cfg.rope_freq_scale != 1)
-        args.push('--rope-freq-scale', String(cfg.rope_freq_scale))
+    // Resolve mmproj path if present
+    let mmprojPath: string | undefined = undefined
+    if (modelConfig.mmproj_path) {
+      mmprojPath = await joinPath([janDataFolderPath, modelConfig.mmproj_path])
     }
 
-    logger.info('Calling Tauri command llama_load with args:', args)
+    logger.info(
+      'Calling Tauri command load_llama_model with config:',
+      JSON.stringify(cfg)
+    )
     const backendPath = await getBackendExePath(backend, version)
 
     try {
       const sInfo = await loadLlamaModel(
         backendPath,
-        args,
+        modelId,
+        modelPath,
+        port,
+        cfg,
         envs,
+        mmprojPath,
         isEmbedding,
         Number(this.timeout)
       )
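The argument construction deleted above now lives on the Rust side (the `ArgumentBuilder` in `args.rs`); the guest-JS helper only passes the normalized config, model path, port, optional mmproj path, and embedding flag to `loadLlamaModel`, as the final hunk shows. Purely for orientation, the sketch below restates a condensed version of that config-to-flag mapping in TypeScript, using the flag names from the removed code; the actual Rust implementation may group, validate, or default these differently.

    import type { LlamacppConfig } from '@janhq/tauri-plugin-llamacpp-api'

    // Illustrative only: a condensed view of the config -> llama-server flag
    // mapping that used to live in index.ts and is now owned by the Rust side.
    function sketchServerArgs(
      cfg: LlamacppConfig,
      modelId: string,
      modelPath: string,
      port: number,
      isEmbedding: boolean
    ): string[] {
      const args = ['--jinja', '-m', modelPath, '-a', modelId, '--port', String(port)]
      if (cfg.cpu_moe) args.push('--cpu-moe')
      if (cfg.n_cpu_moe > 0) args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
      if (cfg.offload_mmproj === false) args.push('--no-mmproj-offload')
      if (cfg.chat_template) args.push('--chat-template', cfg.chat_template)
      args.push('-ngl', String(cfg.n_gpu_layers >= 0 ? cfg.n_gpu_layers : 100))
      if (cfg.no_mmap) args.push('--no-mmap')
      if (cfg.mlock) args.push('--mlock')
      if (cfg.no_kv_offload) args.push('--no-kv-offload')
      if (isEmbedding) {
        args.push('--embedding', '--pooling', 'mean')
      } else if (cfg.ctx_size > 0) {
        args.push('--ctx-size', String(cfg.ctx_size))
      }
      // ...threads, batch sizes, rope and cache-type options follow the same pattern.
      return args
    }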
