@@ -44,106 +44,19 @@ import {
   isModelSupported,
   planModelLoadInternal,
   unloadLlamaModel,
+  LlamacppConfig,
+  ModelPlan,
+  DownloadItem,
+  ModelConfig,
+  EmbeddingResponse,
+  DeviceList,
+  SystemMemory,
 } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'
 
 // Error message constant - matches web-app/src/utils/error.ts
 const OUT_OF_CONTEXT_SIZE = 'the request exceeds the available context size.'
 
-type LlamacppConfig = {
-  version_backend: string
-  auto_update_engine: boolean
-  auto_unload: boolean
-  timeout: number
-  llamacpp_env: string
-  memory_util: string
-  chat_template: string
-  n_gpu_layers: number
-  offload_mmproj: boolean
-  cpu_moe: boolean
-  n_cpu_moe: number
-  override_tensor_buffer_t: string
-  ctx_size: number
-  threads: number
-  threads_batch: number
-  n_predict: number
-  batch_size: number
-  ubatch_size: number
-  device: string
-  split_mode: string
-  main_gpu: number
-  flash_attn: string
-  cont_batching: boolean
-  no_mmap: boolean
-  mlock: boolean
-  no_kv_offload: boolean
-  cache_type_k: string
-  cache_type_v: string
-  defrag_thold: number
-  rope_scaling: string
-  rope_scale: number
-  rope_freq_base: number
-  rope_freq_scale: number
-  ctx_shift: boolean
-}
-
-type ModelPlan = {
-  gpuLayers: number
-  maxContextLength: number
-  noOffloadKVCache: boolean
-  offloadMmproj?: boolean
-  batchSize: number
-  mode: 'GPU' | 'Hybrid' | 'CPU' | 'Unsupported'
-}
-
-interface DownloadItem {
-  url: string
-  save_path: string
-  proxy?: Record<string, string | string[] | boolean>
-  sha256?: string
-  size?: number
-}
-
-interface ModelConfig {
-  model_path: string
-  mmproj_path?: string
-  name: string // user-friendly
-  // some model info that we cache upon import
-  size_bytes: number
-  sha256?: string
-  mmproj_sha256?: string
-  mmproj_size_bytes?: number
-}
-
-interface EmbeddingResponse {
-  model: string
-  object: string
-  usage: {
-    prompt_tokens: number
-    total_tokens: number
-  }
-  data: EmbeddingData[]
-}
-
-interface EmbeddingData {
-  embedding: number[]
-  index: number
-  object: string
-}
-
-interface DeviceList {
-  id: string
-  name: string
-  mem: number
-  free: number
-}
-
-interface SystemMemory {
-  totalVRAM: number
-  totalRAM: number
-  totalMemory: number
-}
-
 /**
  * Override the default app.log function to use Jan's logging system.
  * @param args
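
The type declarations removed above (LlamacppConfig, ModelPlan, DownloadItem, ModelConfig, EmbeddingResponse, DeviceList, SystemMemory) are no longer defined locally; they now come from @janhq/tauri-plugin-llamacpp-api via the new import block. A minimal sketch of the new usage, assuming the package exports the same shapes as the deleted declarations:

import { ModelPlan } from '@janhq/tauri-plugin-llamacpp-api'

// Illustrative values only; the fields mirror the removed local ModelPlan type.
const plan: ModelPlan = {
  gpuLayers: 100,
  maxContextLength: 8192,
  noOffloadKVCache: false,
  batchSize: 512,
  mode: 'GPU',
}
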
@@ -556,6 +469,7 @@ export default class llamacpp_extension extends AIEngine {
     // Vulkan will be conditionally prioritized based on GPU memory
     const backendPriorities: string[] = hasEnoughGpuMemory
       ? [
+          'cuda-cu13.0',
           'cuda-cu12.0',
           'cuda-cu11.7',
           'vulkan',
@@ -568,6 +482,7 @@ export default class llamacpp_extension extends AIEngine {
           'x64',
         ]
       : [
+          'cuda-cu13.0',
           'cuda-cu12.0',
           'cuda-cu11.7',
           'common_cpus', // NEW: Unified CPU backend
@@ -582,6 +497,8 @@ export default class llamacpp_extension extends AIEngine {
 
     // Helper to map backend string to a priority category
     const getBackendCategory = (backendString: string): string | undefined => {
+      if (backendString.includes('cuda-13-common_cpus')) return 'cuda-cu13.0'
+
       if (
         backendString.includes('cuda-12-common_cpus') ||
         backendString.includes('cu12.0')
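
A rough sketch of how the new CUDA 13 category interacts with the priority lists above, assuming backend strings follow the naming seen in this helper (the example inputs are hypothetical):

// Hypothetical inputs; any string containing 'cuda-13-common_cpus' now maps first.
getBackendCategory('win-cuda-13-common_cpus-x64') // 'cuda-cu13.0'
getBackendCategory('win-cuda-12-common_cpus-x64') // presumably 'cuda-cu12.0', per the branch below
// Because 'cuda-cu13.0' precedes 'cuda-cu12.0' in backendPriorities,
// CUDA 13 builds are preferred when both are available.
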
@@ -838,7 +755,7 @@ export default class llamacpp_extension extends AIEngine {
       return { updateNeeded: false, newVersion: '0' }
     }
 
-    const [latestVersion, latestBackend] = targetBackendString.split('/')
+    const [latestVersion] = targetBackendString.split('/')
 
     // Check if update is needed (version comparison)
     if (
@@ -1210,7 +1127,8 @@ export default class llamacpp_extension extends AIEngine {
     // - k_llama-main-b4314-09c61e1-bin-win-cuda-12.8-x64-avx2.zip
     // - ik_llama-main-b4314-09c61e1-cudart-llama-bin-win-cuda-12.8-x64-avx512.zip
     // - llama-b7037-bin-win-cuda-12.4-x64.zip (legacy format)
-    const re = /^(.+?[-_])?llama(?:-main)?-(b\d+(?:-[a-f0-9]+)?)(?:-cudart-llama)?-bin-(.+?)\.(?:tar\.gz|zip)$/
+    const re =
+      /^(.+?[-_])?llama(?:-main)?-(b\d+(?:-[a-f0-9]+)?)(?:-cudart-llama)?-bin-(.+?)\.(?:tar\.gz|zip)$/
 
     const archiveName = await basename(path)
     logger.info(`Installing backend from path: ${path}`)
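
For reference, a small sketch of what the reflowed regex captures for one of the archive names listed in the comment above (hand-worked groups, not output from the installer):

const m = 'k_llama-main-b4314-09c61e1-bin-win-cuda-12.8-x64-avx2.zip'.match(re)
// m?.[1] === 'k_'                     -> optional fork prefix
// m?.[2] === 'b4314-09c61e1'          -> build tag with commit hash
// m?.[3] === 'win-cuda-12.8-x64-avx2' -> platform/backend string
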
@@ -1667,7 +1585,7 @@ export default class llamacpp_extension extends AIEngine {
           return await this.findSessionByModel(modelId)
         } catch (e) {
           logger.warn(`Unable to find session for model "${modelId}": ${e}`)
-          return null // treat as “not‑eligible for unload”
+          return null
         }
       })
     )
@@ -1685,10 +1603,11 @@ export default class llamacpp_extension extends AIEngine {
         }
       }
     }
-    const args: string[] = []
+
     const envs: Record<string, string> = {}
     const cfg = { ...this.config, ...(overrideSettings ?? {}) }
     const [version, backend] = cfg.version_backend.split('/')
+
     if (!version || !backend) {
       throw new Error(
         'Initial setup for the backend failed due to a network issue. Please restart the app!'
@@ -1710,120 +1629,41 @@ export default class llamacpp_extension extends AIEngine {
     })
     const port = await this.getRandomPort()
 
-    // disable llama-server webui
-    // TODO: Determine what's the best course of action here.
-    // Hopefully, we would want all the fork to have same set of arguments
-    // Otherwise it would become impossible to maintain
-    // Keeping this for now
-    if (!backend.startsWith('ik')) args.push('--no-webui')
+    // Generate API key
     const api_key = await this.generateApiKey(modelId, String(port))
     envs['LLAMA_API_KEY'] = api_key
     envs['LLAMA_ARG_TIMEOUT'] = String(this.timeout)
 
-    // set user envs
+    // Set user envs
     if (this.llamacpp_env) this.parseEnvFromString(envs, this.llamacpp_env)
 
-    // model option is required
-    // NOTE: model_path and mmproj_path can be either relative to Jan's data folder or absolute path
+    // Resolve model path
     const modelPath = await joinPath([
       janDataFolderPath,
       modelConfig.model_path,
     ])
-    args.push('--jinja')
-    args.push('-m', modelPath)
-    if (cfg.cpu_moe) args.push('--cpu-moe')
-    if (cfg.n_cpu_moe && cfg.n_cpu_moe > 0) {
-      args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
-    }
-    // For overriding tensor buffer type, useful where
-    // massive MOE models can be made faster by keeping attention on the GPU
-    // and offloading the expert FFNs to the CPU.
-    // This is an expert level settings and should only be used by people
-    // who knows what they are doing.
-    // Takes a regex with matching tensor name as input
-    if (cfg.override_tensor_buffer_t)
-      args.push('--override-tensor', cfg.override_tensor_buffer_t)
-    // offload multimodal projector model to the GPU by default. if there is not enough memory
-    // turn this setting off will keep the projector model on the CPU but the image processing can
-    // take longer
-    if (cfg.offload_mmproj === false) args.push('--no-mmproj-offload')
-    args.push('-a', modelId)
-    args.push('--port', String(port))
-    if (modelConfig.mmproj_path) {
-      const mmprojPath = await joinPath([
-        janDataFolderPath,
-        modelConfig.mmproj_path,
-      ])
-      args.push('--mmproj', mmprojPath)
-    }
-    // Add remaining options from the interface
-    if (cfg.chat_template) args.push('--chat-template', cfg.chat_template)
-    const gpu_layers =
-      parseInt(String(cfg.n_gpu_layers)) >= 0 ? cfg.n_gpu_layers : 100
-    args.push('-ngl', String(gpu_layers))
-    if (cfg.threads > 0) args.push('--threads', String(cfg.threads))
-    if (cfg.threads_batch > 0)
-      args.push('--threads-batch', String(cfg.threads_batch))
-    if (cfg.batch_size > 0) args.push('--batch-size', String(cfg.batch_size))
-    if (cfg.ubatch_size > 0) args.push('--ubatch-size', String(cfg.ubatch_size))
-    if (cfg.device.length > 0) args.push('--device', cfg.device)
-    if (cfg.split_mode.length > 0 && cfg.split_mode != 'layer')
-      args.push('--split-mode', cfg.split_mode)
-    if (cfg.main_gpu !== undefined && cfg.main_gpu !== 0)
-      args.push('--main-gpu', String(cfg.main_gpu))
-    // Note: Older llama.cpp versions are no longer supported
-    if (
-      cfg.flash_attn !== undefined ||
-      (cfg.flash_attn !== 'auto' && // set argument only when the setting value is not auto
-        !backend.startsWith('ik')) // ik fork of llama.cpp doesn't support --flash-attn
-    ) {
-      args.push('--flash-attn', String(cfg.flash_attn)) //default: auto = ON when supported
-    } else if (backend.startsWith('ik') && cfg.flash_attn == 'on') {
-      args.push('-fa') // hoping the ik fork is still using the old fa arguments
-    }
-
-    // Boolean flags
-    if (cfg.ctx_shift) args.push('--context-shift')
-    if (cfg.cont_batching) args.push('--cont-batching')
-    if (cfg.no_mmap) args.push('--no-mmap')
-    if (cfg.mlock) args.push('--mlock')
-    if (cfg.no_kv_offload) args.push('--no-kv-offload')
-    if (isEmbedding) {
-      args.push('--embedding')
-      args.push('--pooling', 'mean')
-    } else {
-      if (cfg.ctx_size > 0) args.push('--ctx-size', String(cfg.ctx_size))
-      if (cfg.n_predict > 0) args.push('--n-predict', String(cfg.n_predict))
-      if (cfg.cache_type_k && cfg.cache_type_k != 'f16')
-        args.push('--cache-type-k', cfg.cache_type_k)
-      if (
-        cfg.flash_attn !== 'on' &&
-        cfg.cache_type_v != 'f16' &&
-        cfg.cache_type_v != 'f32'
-      ) {
-        args.push('--cache-type-v', cfg.cache_type_v)
-      }
-      if (cfg.defrag_thold && cfg.defrag_thold != 0.1)
-        args.push('--defrag-thold', String(cfg.defrag_thold))
 
-      if (cfg.rope_scaling && cfg.rope_scaling != 'none')
-        args.push('--rope-scaling', cfg.rope_scaling)
-      if (cfg.rope_scale && cfg.rope_scale != 1)
-        args.push('--rope-scale', String(cfg.rope_scale))
-      if (cfg.rope_freq_base && cfg.rope_freq_base != 0)
-        args.push('--rope-freq-base', String(cfg.rope_freq_base))
-      if (cfg.rope_freq_scale && cfg.rope_freq_scale != 1)
-        args.push('--rope-freq-scale', String(cfg.rope_freq_scale))
+    // Resolve mmproj path if present
+    let mmprojPath: string | undefined = undefined
+    if (modelConfig.mmproj_path) {
+      mmprojPath = await joinPath([janDataFolderPath, modelConfig.mmproj_path])
     }
 
-    logger.info('Calling Tauri command llama_load with args:', args)
+    logger.info(
+      'Calling Tauri command load_llama_model with config:',
+      JSON.stringify(cfg)
+    )
     const backendPath = await getBackendExePath(backend, version)
 
     try {
       const sInfo = await loadLlamaModel(
         backendPath,
-        args,
+        modelId,
+        modelPath,
+        port,
+        cfg,
         envs,
+        mmprojPath,
         isEmbedding,
         Number(this.timeout)
       )
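
With this hunk the per-flag CLI assembly (--jinja, -ngl, --ctx-size, and the rest) is gone from the extension; the structured cfg object is passed to loadLlamaModel along with the resolved paths and port, and translating it into llama-server arguments is presumably handled inside the tauri-plugin-llamacpp-api plugin. A sketch of the new call shape, taken from the added lines, with assumptions noted in comments:

const sInfo = await loadLlamaModel(
  backendPath,   // resolved llama-server executable
  modelId,
  modelPath,
  port,
  cfg,           // merged LlamacppConfig (flag mapping assumed to live in the plugin)
  envs,          // LLAMA_API_KEY, LLAMA_ARG_TIMEOUT, user env overrides
  mmprojPath,    // undefined when the model has no multimodal projector
  isEmbedding,
  Number(this.timeout)
)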