Commit bb33a5d

test: fix tests
1 parent: 2527413

3 files changed: +37 −5 lines changed

src/evaluator/LlamaContext/types.ts

Lines changed: 12 additions & 0 deletions
@@ -326,6 +326,8 @@ export type SequenceEvaluateMetadataOptions = {
     * Same as `probabilities.get(token)` from the output.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+    *
+    * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    readonly confidence?: boolean,
 
@@ -359,6 +361,8 @@ export type SequenceEvaluateOutput<
     * Same as `probabilities.get(token)`.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+    *
+    * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    confidence: number,
 
@@ -367,6 +371,8 @@ export type SequenceEvaluateOutput<
     *
     * A probability is a number from `0` to `1`.
     *
+    * The probabilities might be slightly different when evaluated on different GPUs and configurations.
+    *
     * The map is sorted by the probability of the tokens from the highest to the lowest,
     * and is reflected in the order of the entries when iterating over the map.
     * Use `.entries().next().value` to get the top probability pair
@@ -392,6 +398,8 @@ export type ControlledEvaluateInputItem = Token | [token: Token, options: {
     * Same as `next.probabilities.get(next.token)` from the output.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+    *
+    * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    confidence?: boolean,
 
@@ -437,6 +445,8 @@ export type ControlledEvaluateIndexOutput = {
     * Same as `next.probabilities.get(next.token)`.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
+    *
+    * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    confidence?: number,
 
@@ -445,6 +455,8 @@ export type ControlledEvaluateIndexOutput = {
     *
     * A probability is a number from `0` to `1`.
     *
+    * The probabilities might be slightly different when evaluated on different GPUs and configurations.
+    *
     * The map is sorted by the probability of the tokens from the highest to the lowest,
     * and is reflected in the order of the entries when iterating over the map.
     * Use `.entries().next().value` to get the top probability pair
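
Taken together, these comments document two behaviors of the evaluation metadata: `confidence` is a cheaper shortcut for `probabilities.get(token)`, and neither value is bit-exact across GPU types and configurations. A minimal sketch of how the options surface at the call site, assuming a `sequence.evaluateWithMetadata()` method matching the types above; the model path and prompt are placeholders:

    import {getLlama} from "node-llama-cpp";

    const llama = await getLlama();
    const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
    const context = await model.createContext();
    const sequence = context.getSequence();

    const tokens = model.tokenize("The quick brown fox");
    for await (const output of sequence.evaluateWithMetadata(tokens, {confidence: true, probabilities: true})) {
        // `confidence` equals `probabilities.get(output.token)`; request it alone
        // to skip building the full probabilities list
        console.log("confidence:", output.confidence);

        // the map is sorted from the highest probability to the lowest
        const topPair = output.probabilities.entries().next().value;
        console.log("top token and probability:", topPair);
    }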

test/modelDependent/llama3.1/controlledEvaluate.test.ts

Lines changed: 5 additions & 1 deletion
@@ -5,10 +5,14 @@ import {getTestLlama} from "../../utils/getTestLlama.js";
 
 describe("llama 3.1", () => {
     describe("controlled evaluate", () => {
-        test("get probabilities for 3 tokens", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("get probabilities for 3 tokens", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
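
The fix is the same in every affected test: the expected values were recorded on Metal, and since the computed numbers differ slightly across GPU backends, each test now skips itself at runtime everywhere else. Vitest passes a test context to the test function, and calling its `skip()` marks the test as skipped. A condensed sketch of the pattern, with the assertion body elided:

    import {describe, test} from "vitest";
    import {getTestLlama} from "../../utils/getTestLlama.js";

    describe("llama 3.1", () => {
        test("GPU-dependent snapshot", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
            const llama = await getTestLlama();

            // the backend is only known after this async setup resolves, so a
            // declarative `test.skipIf(...)` can't express the condition;
            // the runtime `testContext.skip()` is used instead
            if (llama.gpu !== "metal")
                testContext.skip();

            // ...load the model and compare outputs against the Metal-recorded snapshots
        });
    });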

test/modelDependent/llama3.1/evaluateWithMetadata.test.ts

Lines changed: 20 additions & 4 deletions
@@ -67,10 +67,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with probabilities", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with probabilities", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -246,10 +250,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with confidence", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -315,10 +323,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("with probabilities and confidence", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("with probabilities and confidence", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
@@ -504,10 +516,14 @@ describe("llama 3.1", () => {
             `);
         });
 
-        test("confidence alone matches probability alone", {timeout: 1000 * 60 * 60 * 2}, async () => {
+        test("confidence alone matches probability alone", {timeout: 1000 * 60 * 60 * 2}, async (testContext) => {
             const modelPath = await getModelFile("Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf");
             const llama = await getTestLlama();
 
+            // the precise values are different for each GPU type, so we skip the test for GPUs other than metal
+            if (llama.gpu !== "metal")
+                testContext.skip();
+
             const model = await llama.loadModel({
                 modelPath
             });
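
The last test name states the invariant the documentation promises: requesting `confidence` alone must produce the same numbers as reading them out of the full `probabilities` map. A rough sketch of that check, assuming the same `evaluateWithMetadata()` API sketched earlier; the helper name is illustrative, and a real test would reset the sequence state between the two runs:

    import type {LlamaContextSequence, Token} from "node-llama-cpp";

    // evaluate the same tokens twice: once collecting full probability maps,
    // once collecting only the confidence values, then compare pairwise
    async function confidenceMatchesProbabilities(sequence: LlamaContextSequence, tokens: Token[]) {
        const fromProbabilities: (number | undefined)[] = [];
        for await (const output of sequence.evaluateWithMetadata(tokens, {probabilities: true}))
            fromProbabilities.push(output.probabilities.get(output.token));

        // note: a real test would clear the sequence state here before re-evaluating
        const fromConfidence: number[] = [];
        for await (const output of sequence.evaluateWithMetadata(tokens, {confidence: true}))
            fromConfidence.push(output.confidence);

        return fromProbabilities.every((probability, index) => probability === fromConfidence[index]);
    }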
