@@ -78,6 +78,80 @@ public class OllamaEmbeddingOptions implements EmbeddingOptions {
7878 @ JsonProperty ("truncate" )
7979 private Boolean truncate ;
8080
81+ // @formatter:off
82+
83+ /**
84+ * Whether to use NUMA. (Default: false)
85+ */
86+ @ JsonProperty ("numa" )
87+ private Boolean useNUMA ;
88+
89+ /**
90+ * Prompt processing maximum batch size. (Default: 512)
91+ */
92+ @ JsonProperty ("num_batch" )
93+ private Integer numBatch ;
94+
95+ /**
96+ * The number of layers to send to the GPU(s). On macOS, it defaults to 1
97+ * to enable metal support, 0 to disable.
98+ * (Default: -1, which indicates that numGPU should be set dynamically)
99+ */
100+ @ JsonProperty ("num_gpu" )
101+ private Integer numGPU ;
102+
103+ /**
104+ * When using multiple GPUs this option controls which GPU is used
105+ * for small tensors for which the overhead of splitting the computation
106+ * across all GPUs is not worthwhile. The GPU in question will use slightly
107+ * more VRAM to store a scratch buffer for temporary results.
108+ * By default, GPU 0 is used.
109+ */
110+ @ JsonProperty ("main_gpu" )
111+ private Integer mainGPU ;
112+
113+ /**
114+ * (Default: false)
115+ */
116+ @ JsonProperty ("low_vram" )
117+ private Boolean lowVRAM ;
118+
119+ /**
120+ * Load only the vocabulary, not the weights.
121+ */
122+ @ JsonProperty ("vocab_only" )
123+ private Boolean vocabOnly ;
124+
125+ /**
126+ * By default, models are mapped into memory, which allows the system to load only the necessary parts
127+ * of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low
128+ * on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
129+ * Disabling mmap results in slower load times but may reduce pageouts if you're not using mlock.
130+ * Note that if the model is larger than the total amount of RAM, turning off mmap would prevent
131+ * the model from loading at all.
132+ * (Default: null)
133+ */
134+ @ JsonProperty ("use_mmap" )
135+ private Boolean useMMap ;
136+
137+ /**
138+ * Lock the model in memory, preventing it from being swapped out when memory-mapped.
139+ * This can improve performance but trades away some of the advantages of memory-mapping
140+ * by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
141+ * (Default: false)
142+ */
143+ @ JsonProperty ("use_mlock" )
144+ private Boolean useMLock ;
145+
146+ /**
147+ * Set the number of threads to use during generation. For optimal performance, it is recommended to set this value
148+ * to the number of physical CPU cores your system has (as opposed to the logical number of cores).
149+ * Using the correct number of threads can greatly improve performance.
150+ * By default, Ollama will detect this value for optimal performance.
151+ */
152+ @ JsonProperty ("num_thread" )
153+ private Integer numThread ;
154+
81155 public static Builder builder () {
82156 return new Builder ();
83157 }
@@ -93,19 +167,37 @@ public static Map<String, Object> filterNonSupportedFields(Map<String, Object> o
93167 .collect (Collectors .toMap (Map .Entry ::getKey , Map .Entry ::getValue ));
94168 }
95169
96- public static OllamaEmbeddingOptions fromOptions (OllamaOptions fromOptions ) {
170+ public static OllamaEmbeddingOptions fromOptions (OllamaEmbeddingOptions fromOptions ) {
97171 return builder ()
98172 .model (fromOptions .getModel ())
99173 .keepAlive (fromOptions .getKeepAlive ())
100174 .truncate (fromOptions .getTruncate ())
175+ .useNUMA (fromOptions .getUseNUMA ())
176+ .numBatch (fromOptions .getNumBatch ())
177+ .numGPU (fromOptions .getNumGPU ())
178+ .mainGPU (fromOptions .getMainGPU ())
179+ .lowVRAM (fromOptions .getLowVRAM ())
180+ .vocabOnly (fromOptions .getVocabOnly ())
181+ .useMMap (fromOptions .getUseMMap ())
182+ .useMLock (fromOptions .getUseMLock ())
183+ .numThread (fromOptions .getNumThread ())
101184 .build ();
102185 }
103186
104- public static OllamaEmbeddingOptions fromOptions (OllamaEmbeddingOptions fromOptions ) {
187+ public static OllamaEmbeddingOptions fromOptions (OllamaOptions fromOptions ) {
105188 return builder ()
106189 .model (fromOptions .getModel ())
107190 .keepAlive (fromOptions .getKeepAlive ())
108191 .truncate (fromOptions .getTruncate ())
192+ .useNUMA (fromOptions .getUseNUMA ())
193+ .numBatch (fromOptions .getNumBatch ())
194+ .numGPU (fromOptions .getNumGPU ())
195+ .mainGPU (fromOptions .getMainGPU ())
196+ .lowVRAM (fromOptions .getLowVRAM ())
197+ .vocabOnly (fromOptions .getVocabOnly ())
198+ .useMMap (fromOptions .getUseMMap ())
199+ .useMLock (fromOptions .getUseMLock ())
200+ .numThread (fromOptions .getNumThread ())
109201 .build ();
110202 }
111203
@@ -137,6 +229,78 @@ public void setTruncate(Boolean truncate) {
137229 this .truncate = truncate ;
138230 }
139231
232+ public Boolean getUseNUMA () {
233+ return this .useNUMA ;
234+ }
235+
236+ public void setUseNUMA (Boolean useNUMA ) {
237+ this .useNUMA = useNUMA ;
238+ }
239+
240+ public Integer getNumBatch () {
241+ return this .numBatch ;
242+ }
243+
244+ public void setNumBatch (Integer numBatch ) {
245+ this .numBatch = numBatch ;
246+ }
247+
248+ public Integer getNumGPU () {
249+ return this .numGPU ;
250+ }
251+
252+ public void setNumGPU (Integer numGPU ) {
253+ this .numGPU = numGPU ;
254+ }
255+
256+ public Integer getMainGPU () {
257+ return this .mainGPU ;
258+ }
259+
260+ public void setMainGPU (Integer mainGPU ) {
261+ this .mainGPU = mainGPU ;
262+ }
263+
264+ public Boolean getLowVRAM () {
265+ return this .lowVRAM ;
266+ }
267+
268+ public void setLowVRAM (Boolean lowVRAM ) {
269+ this .lowVRAM = lowVRAM ;
270+ }
271+
272+ public Boolean getVocabOnly () {
273+ return this .vocabOnly ;
274+ }
275+
276+ public void setVocabOnly (Boolean vocabOnly ) {
277+ this .vocabOnly = vocabOnly ;
278+ }
279+
280+ public Boolean getUseMMap () {
281+ return this .useMMap ;
282+ }
283+
284+ public void setUseMMap (Boolean useMMap ) {
285+ this .useMMap = useMMap ;
286+ }
287+
288+ public Boolean getUseMLock () {
289+ return this .useMLock ;
290+ }
291+
292+ public void setUseMLock (Boolean useMLock ) {
293+ this .useMLock = useMLock ;
294+ }
295+
296+ public Integer getNumThread () {
297+ return this .numThread ;
298+ }
299+
300+ public void setNumThread (Integer numThread ) {
301+ this .numThread = numThread ;
302+ }
303+
140304 @ Override
141305 @ JsonIgnore
142306 public Integer getDimensions () {
@@ -198,6 +362,51 @@ public Builder truncate(Boolean truncate) {
198362 return this ;
199363 }
200364
365+ public Builder useNUMA (Boolean useNUMA ) {
366+ this .options .useNUMA = useNUMA ;
367+ return this ;
368+ }
369+
370+ public Builder numBatch (Integer numBatch ) {
371+ this .options .numBatch = numBatch ;
372+ return this ;
373+ }
374+
375+ public Builder numGPU (Integer numGPU ) {
376+ this .options .numGPU = numGPU ;
377+ return this ;
378+ }
379+
380+ public Builder mainGPU (Integer mainGPU ) {
381+ this .options .mainGPU = mainGPU ;
382+ return this ;
383+ }
384+
385+ public Builder lowVRAM (Boolean lowVRAM ) {
386+ this .options .lowVRAM = lowVRAM ;
387+ return this ;
388+ }
389+
390+ public Builder vocabOnly (Boolean vocabOnly ) {
391+ this .options .vocabOnly = vocabOnly ;
392+ return this ;
393+ }
394+
395+ public Builder useMMap (Boolean useMMap ) {
396+ this .options .useMMap = useMMap ;
397+ return this ;
398+ }
399+
400+ public Builder useMLock (Boolean useMLock ) {
401+ this .options .useMLock = useMLock ;
402+ return this ;
403+ }
404+
405+ public Builder numThread (Integer numThread ) {
406+ this .options .numThread = numThread ;
407+ return this ;
408+ }
409+
201410 public OllamaEmbeddingOptions build () {
202411 return this .options ;
203412 }
0 commit comments