
Commit b9593fe

Remove redundant numGQA option from Ollama options
1 parent 5642521 commit b9593fe

File tree

3 files changed (+8, -31 lines):
- models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java
- spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc
- spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc

models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java

Lines changed: 8 additions & 29 deletions
@@ -71,12 +71,6 @@ public class OllamaOptions implements FunctionCallingOptions, ChatOptions, Embed
 	 */
 	@JsonProperty("num_batch") private Integer numBatch;
 
-	/**
-	 * The number of GQA groups in the transformer layer. Required for some models,
-	 * for example it is 8 for llama2:70b.
-	 */
-	@JsonProperty("num_gqa") private Integer numGQA;
-
 	/**
 	 * The number of layers to send to the GPU(s). On macOS, it defaults to 1
 	 * to enable metal support, 0 to disable.
@@ -327,11 +321,6 @@ public OllamaOptions withNumBatch(Integer numBatch) {
 		return this;
 	}
 
-	public OllamaOptions withNumGQA(Integer numGQA) {
-		this.numGQA = numGQA;
-		return this;
-	}
-
 	public OllamaOptions withNumGPU(Integer numGPU) {
 		this.numGPU = numGPU;
 		return this;
@@ -518,14 +507,6 @@ public void setNumBatch(Integer numBatch) {
 		this.numBatch = numBatch;
 	}
 
-	public Integer getNumGQA() {
-		return this.numGQA;
-	}
-
-	public void setNumGQA(Integer numGQA) {
-		this.numGQA = numGQA;
-	}
-
 	public Integer getNumGPU() {
 		return this.numGPU;
 	}
@@ -795,7 +776,6 @@ public static OllamaOptions fromOptions(OllamaOptions fromOptions) {
 			.withUseNUMA(fromOptions.getUseNUMA())
 			.withNumCtx(fromOptions.getNumCtx())
 			.withNumBatch(fromOptions.getNumBatch())
-			.withNumGQA(fromOptions.getNumGQA())
 			.withNumGPU(fromOptions.getNumGPU())
 			.withMainGPU(fromOptions.getMainGPU())
 			.withLowVRAM(fromOptions.getLowVRAM())
@@ -837,14 +817,13 @@ public boolean equals(Object o) {
 		return Objects.equals(model, that.model) && Objects.equals(format, that.format)
 				&& Objects.equals(keepAlive, that.keepAlive) && Objects.equals(useNUMA, that.useNUMA)
 				&& Objects.equals(numCtx, that.numCtx) && Objects.equals(numBatch, that.numBatch)
-				&& Objects.equals(numGQA, that.numGQA) && Objects.equals(numGPU, that.numGPU)
-				&& Objects.equals(mainGPU, that.mainGPU) && Objects.equals(lowVRAM, that.lowVRAM)
-				&& Objects.equals(f16KV, that.f16KV) && Objects.equals(logitsAll, that.logitsAll)
-				&& Objects.equals(vocabOnly, that.vocabOnly) && Objects.equals(useMMap, that.useMMap)
-				&& Objects.equals(useMLock, that.useMLock) && Objects.equals(numThread, that.numThread)
-				&& Objects.equals(numKeep, that.numKeep) && Objects.equals(seed, that.seed)
-				&& Objects.equals(numPredict, that.numPredict) && Objects.equals(topK, that.topK)
-				&& Objects.equals(topP, that.topP) && Objects.equals(tfsZ, that.tfsZ)
+				&& Objects.equals(numGPU, that.numGPU) && Objects.equals(mainGPU, that.mainGPU)
+				&& Objects.equals(lowVRAM, that.lowVRAM) && Objects.equals(f16KV, that.f16KV)
+				&& Objects.equals(logitsAll, that.logitsAll) && Objects.equals(vocabOnly, that.vocabOnly)
+				&& Objects.equals(useMMap, that.useMMap) && Objects.equals(useMLock, that.useMLock)
+				&& Objects.equals(numThread, that.numThread) && Objects.equals(numKeep, that.numKeep)
+				&& Objects.equals(seed, that.seed) && Objects.equals(numPredict, that.numPredict)
+				&& Objects.equals(topK, that.topK) && Objects.equals(topP, that.topP) && Objects.equals(tfsZ, that.tfsZ)
 				&& Objects.equals(typicalP, that.typicalP) && Objects.equals(repeatLastN, that.repeatLastN)
 				&& Objects.equals(temperature, that.temperature) && Objects.equals(repeatPenalty, that.repeatPenalty)
 				&& Objects.equals(presencePenalty, that.presencePenalty)
@@ -858,7 +837,7 @@ public boolean equals(Object o) {
 	@Override
 	public int hashCode() {
 		return Objects.hash(this.model, this.format, this.keepAlive, this.useNUMA, this.numCtx, this.numBatch,
-				this.numGQA, numGPU, mainGPU, lowVRAM, this.f16KV, this.logitsAll, this.vocabOnly, this.useMMap,
+				this.numGPU, this.mainGPU, lowVRAM, this.f16KV, this.logitsAll, this.vocabOnly, this.useMMap,
 				this.useMLock, this.numThread, this.numKeep, this.seed, this.numPredict, this.topK, this.topP, tfsZ,
 				this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty, this.presencePenalty,
 				this.frequencyPenalty, this.mirostat, this.mirostatTau, this.mirostatEta, this.penalizeNewline,
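For code that configured numGQA, migration is just deleting the call: the builder chain now runs straight from withNumBatch to withNumGPU. A minimal sketch using only methods visible in this diff (the option values are illustrative, and the public no-arg constructor is an assumption):

import org.springframework.ai.ollama.api.OllamaOptions;

public class OptionsAfterRemoval {

	public static void main(String[] args) {
		// Before b9593fe a .withNumGQA(8) call could sit between withNumBatch
		// and withNumGPU; after this commit it no longer compiles.
		OllamaOptions options = new OllamaOptions() // assumes the no-arg constructor is public
			.withNumCtx(2048)   // context window size (docs default)
			.withNumBatch(512)  // batch size (docs default for chat)
			.withNumGPU(-1);    // -1 lets Ollama pick the GPU layer count dynamically

		System.out.println(options.getNumGPU()); // -1
	}
}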

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc

Lines changed: 0 additions & 1 deletion
@@ -75,7 +75,6 @@ The remaining `options` properties are based on the link:https://github.com/olla
 | spring.ai.ollama.chat.options.numa | Whether to use NUMA. | false
 | spring.ai.ollama.chat.options.num-ctx | Sets the size of the context window used to generate the next token. | 2048
 | spring.ai.ollama.chat.options.num-batch | ??? | 512
-| spring.ai.ollama.chat.options.num-gqa | The number of GQA groups in the transformer layer. Required for some models, for example, it is 8 for llama2:70b. | 1
 | spring.ai.ollama.chat.options.num-gpu | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. 1 here indicates that NumGPU should be set dynamically | -1
 | spring.ai.ollama.chat.options.main-gpu | ??? | -
 | spring.ai.ollama.chat.options.low-vram | ??? | false
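Each spring.ai.ollama.chat.options.* row in this table binds to the matching OllamaOptions field, so the surviving defaults can also be set programmatically. A sketch of that mapping, again assuming only the builder methods shown in the Java diff above (the property-to-field pairing in the comments is inferred from the table):

import org.springframework.ai.ollama.api.OllamaOptions;

public class ChatOptionDefaults {

	public static void main(String[] args) {
		// Programmatic twin of the chat options table after this commit;
		// num-gqa no longer has a counterpart.
		OllamaOptions chatDefaults = new OllamaOptions()
			.withUseNUMA(false)  // spring.ai.ollama.chat.options.numa
			.withNumCtx(2048)    // spring.ai.ollama.chat.options.num-ctx
			.withNumBatch(512)   // spring.ai.ollama.chat.options.num-batch
			.withNumGPU(-1)      // spring.ai.ollama.chat.options.num-gpu
			.withLowVRAM(false); // spring.ai.ollama.chat.options.low-vram

		System.out.println(chatDefaults.getNumCtx()); // 2048
	}
}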

spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ The remaining `options` properties are based on the link:https://github.com/olla
 | spring.ai.ollama.embedding.options.numa | Whether to use NUMA. | false
 | spring.ai.ollama.embedding.options.num-ctx | Sets the size of the context window used to generate the next token. | 2048
 | spring.ai.ollama.embedding.options.num-batch | ??? | -
-| spring.ai.ollama.embedding.options.num-gqa | The number of GQA groups in the transformer layer. Required for some models, for example, it is 8 for llama2:70b. | -
 | spring.ai.ollama.embedding.options.num-gpu | The number of layers to send to the GPU(s). On macOS it defaults to 1 to enable metal support, 0 to disable. | -
 | spring.ai.ollama.embedding.options.main-gpu | ??? | -
 | spring.ai.ollama.embedding.options.low-vram | ??? | -
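fromOptions, whose builder chain lost its withNumGQA line in the first file, is the copy factory for deriving per-request settings from shared defaults. A sketch restricted to methods this diff confirms:

import org.springframework.ai.ollama.api.OllamaOptions;

public class PerRequestOverride {

	public static void main(String[] args) {
		OllamaOptions shared = new OllamaOptions()
			.withNumCtx(2048)
			.withNumGPU(-1);

		// fromOptions copies every remaining field (its numGQA line was
		// deleted in this commit), so the copy can be tweaked without
		// mutating the shared instance.
		OllamaOptions perRequest = OllamaOptions.fromOptions(shared)
			.withNumCtx(4096);

		System.out.println(shared.getNumCtx());     // 2048
		System.out.println(perRequest.getNumCtx()); // 4096
	}
}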
