Skip to content

Commit 63d9c3b

Browse files
authored
Improve unit test coverage (#1095)
* Improve unit test auto-detection * Use default model options * Add mgp_str unit tests * Add janus processing unit tests * Add jina_clip processor unit tests * Fix typo in filename * Create `rand` tensor function * Add VitPose unit test * Add sam modelling unit test * Improve pipeline unit tests * Add image utilities unit testing * Add image segmentation pipeline unit tests * Add zero-shot classification pipeline unit test * Move pipeline unit tests to subfolder * Add instanceof checks * Add image feature extraction pipeline tests * Add feature extraction pipeline unit tests * Add zero-shot object detection pipeline unit tests * Add depth estimation pipeline unit test * Add automatic speech recognition pipeline unit test * Fix typo * Add text to audio pipeline unit tests * Add image to text pipeline unit test * Add image to image pipeline unit test * Add zero-shot audio classification pipeline unit test * Fix typo * Add summarization pipeline unit test * Add text2text generation unit test * Add text2text generation pipeline unit test * Remove unused variables
1 parent effa9a9 commit 63d9c3b

File tree

55 files changed

+3099
-1459
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

55 files changed

+3099
-1459
lines changed

src/utils/tensor.js

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,6 +1430,20 @@ export function zeros_like(tensor) {
14301430
return zeros(tensor.dims);
14311431
}
14321432

1433+
/**
 * Returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1).
 *
 * NOTE: values come from `Math.random()`, so they are neither cryptographically
 * secure nor reproducible across runs (there is no seeding mechanism).
 * @param {number[]} size A sequence of integers defining the shape of the output tensor.
 * @returns {Tensor} The random tensor.
 */
export function rand(size) {
    // Total element count is the product of all dimensions; the initial value 1
    // means an empty `size` produces a single-element (scalar-shaped) tensor.
    const length = size.reduce((a, b) => a * b, 1);
    return new Tensor(
        "float32",
        Float32Array.from({ length }, () => Math.random()),
        size,
    );
}
1446+
14331447
/**
14341448
* Quantizes the embeddings tensor to binary or unsigned binary precision.
14351449
* @param {Tensor} tensor The tensor to quantize.

tests/asset_cache.js

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { RawImage } from "../src/transformers.js";
33
const BASE_URL = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/";
44
const TEST_IMAGES = Object.freeze({
55
white_image: BASE_URL + "white-image.png",
6+
blue_image: BASE_URL + "blue-image.png",
67
pattern_3x3: BASE_URL + "pattern_3x3.png",
78
pattern_3x5: BASE_URL + "pattern_3x5.png",
89
checkerboard_8x8: BASE_URL + "checkerboard_8x8.png",
@@ -21,8 +22,14 @@ const TEST_IMAGES = Object.freeze({
2122

2223
beetle: BASE_URL + "beetle.png",
2324
book_cover: BASE_URL + "book-cover.png",
25+
corgi: BASE_URL + "corgi.jpg",
26+
man_on_car: BASE_URL + "young-man-standing-and-leaning-on-car.jpg",
2427
});
2528

29+
// Remote audio fixtures used by the unit tests.
// Frozen for consistency with `TEST_IMAGES` above, so tests cannot mutate the registry.
const TEST_AUDIOS = Object.freeze({
  mlk: BASE_URL + "mlk.npy",
});
32+
2633
/** @type {Map<string, RawImage>} */
2734
const IMAGE_CACHE = new Map();
2835
const load_image = async (url) => {
@@ -35,9 +42,29 @@ const load_image = async (url) => {
3542
return image;
3643
};
3744

45+
/** @type {Map<string, Float32Array>} */
const AUDIO_CACHE = new Map();

/**
 * Fetch an audio asset and decode it as a Float32Array, memoizing the result by URL.
 * @param {string} url The URL of the audio file to load.
 * @returns {Promise<Float32Array>} The decoded audio samples.
 */
const load_audio = async (url) => {
  // Return the cached copy if this URL was already fetched.
  const cached = AUDIO_CACHE.get(url);
  if (cached) {
    return cached;
  }
  const buffer = await (await fetch(url)).arrayBuffer();
  // NOTE(review): the response bytes are reinterpreted directly as raw float64
  // samples, then downcast to float32. A standard `.npy` file begins with a
  // header, so this presumably relies on the hosted asset being raw float64
  // data (or on the header decoding to ignorable values) — confirm against
  // the actual asset.
  const audio = Float32Array.from(new Float64Array(buffer));
  AUDIO_CACHE.set(url, audio);
  return audio;
};
57+
3858
/**
3959
* Load a cached image.
4060
* @param {keyof typeof TEST_IMAGES} name The name of the image to load.
4161
* @returns {Promise<RawImage>} The loaded image.
4262
*/
4363
export const load_cached_image = (name) => load_image(TEST_IMAGES[name]);
64+
65+
/**
 * Retrieve an audio test fixture by name, downloading and caching it on first use.
 * @param {keyof typeof TEST_AUDIOS} key Key identifying the audio asset to load.
 * @returns {Promise<Float32Array>} The audio samples.
 */
export const load_cached_audio = (key) => load_audio(TEST_AUDIOS[key]);

tests/feature_extractors.test.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Entry point for the feature-extractor test suite: initialize the test
// environment, then collect and run the tests registered under the
// 'feature_extraction' group (discovery logic lives in test_utils.js).
import { init } from "./init.js";
import { collect_and_execute_tests } from "./test_utils.js";

init();
await collect_and_execute_tests("Feature extractors", "feature_extraction");

tests/image_processors.test.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Entry point for the image-processor test suite: initialize the test
// environment, then collect and run the tests registered under the
// 'image_processing' group (discovery logic lives in test_utils.js).
import { init } from "./init.js";
import { collect_and_execute_tests } from "./test_utils.js";

init();
await collect_and_execute_tests("Image processors", "image_processing");

tests/init.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,18 @@ export function init() {
5757
registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY);
5858
}
5959

60+
export const MAX_TOKENIZER_LOAD_TIME = 10_000; // 10 seconds
61+
export const MAX_FEATURE_EXTRACTOR_LOAD_TIME = 10_000; // 10 seconds
6062
export const MAX_PROCESSOR_LOAD_TIME = 10_000; // 10 seconds
6163
export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds
6264
export const MAX_TEST_EXECUTION_TIME = 60_000; // 60 seconds
6365
export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second
6466

6567
export const MAX_TEST_TIME = MAX_MODEL_LOAD_TIME + MAX_TEST_EXECUTION_TIME + MAX_MODEL_DISPOSE_TIME;
6668

67-
export const DEFAULT_MODEL_OPTIONS = {
69+
export const DEFAULT_MODEL_OPTIONS = Object.freeze({
6870
dtype: "fp32",
69-
};
71+
});
7072

7173
expect.extend({
7274
toBeCloseToNested(received, expected, numDigits = 2) {

tests/models.test.js

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@
22
* Test that models loaded outside of the `pipeline` function work correctly (e.g., `AutoModel.from_pretrained(...)`);
33
*/
44

5-
import * as MODEL_TESTS from "./models/all_modeling_tests.js";
6-
75
import { AutoTokenizer, AutoModel, BertModel, GPT2Model, T5ForConditionalGeneration, BertTokenizer, GPT2Tokenizer, T5Tokenizer } from "../src/transformers.js";
8-
9-
import { init, MAX_TEST_EXECUTION_TIME } from "./init.js";
10-
11-
import { compare } from "./test_utils.js";
6+
import { init, MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js";
7+
import { compare, collect_and_execute_tests } from "./test_utils.js";
128

139
// Initialise the testing environment
1410
init();
@@ -38,7 +34,7 @@ describe("Loading different architecture types", () => {
3834
async () => {
3935
// Load model and tokenizer
4036
const tokenizer = await tokenizerClassToTest.from_pretrained(model_id);
41-
const model = await modelClassToTest.from_pretrained(model_id, { dtype: "fp32" });
37+
const model = await modelClassToTest.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS);
4238

4339
const tests = [
4440
texts[0], // single
@@ -65,7 +61,6 @@ describe("Loading different architecture types", () => {
6561
throw new Error("Unexpected output");
6662
}
6763
}
68-
6964
await model.dispose();
7065
},
7166
MAX_TEST_EXECUTION_TIME,
@@ -74,8 +69,4 @@ describe("Loading different architecture types", () => {
7469
}
7570
});
7671

77-
describe("Model-specific tests", () => {
78-
for (const [modelName, modelTest] of Object.entries(MODEL_TESTS)) {
79-
describe(modelName, modelTest);
80-
}
81-
});
72+
await collect_and_execute_tests("Model-specific tests", "modeling");

tests/models/all_modeling_tests.js

Lines changed: 0 additions & 33 deletions
This file was deleted.

tests/models/all_tokenization_tests.js

Lines changed: 0 additions & 22 deletions
This file was deleted.
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import { AutoFeatureExtractor, ASTFeatureExtractor } from "../../../src/transformers.js";
2+
3+
import { load_cached_audio } from "../../asset_cache.js";
4+
import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";
5+
6+
// Unit tests for ASTFeatureExtractor: loads the pretrained extractor once,
// then spot-checks the spectrogram it produces for a cached audio clip.
// NOTE(review): the numeric reference values below were presumably generated
// from a trusted (e.g. Python) implementation — confirm their provenance
// before updating any of them.
export default () => {
  // ASTFeatureExtractor
  describe("ASTFeatureExtractor", () => {
    const model_id = "Xenova/ast-finetuned-audioset-10-10-0.4593";

    /** @type {ASTFeatureExtractor} */
    let feature_extractor;
    beforeAll(async () => {
      feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id);
    }, MAX_FEATURE_EXTRACTOR_LOAD_TIME);

    it(
      "truncation",
      async () => {
        const audio = await load_cached_audio("mlk");
        const { input_values } = await feature_extractor(audio);
        expect(input_values.dims).toEqual([1, 1024, 128]);

        // Spot-check the mean plus individual elements. With the last
        // dimension of size 128, index 129 falls in the second row and
        // index 1025 in the ninth row of the [1024, 128] feature matrix.
        expect(input_values.mean().item()).toBeCloseTo(-0.04054912979309085);
        expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914);
        expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157);
        expect(input_values.data[129]).toBeCloseTo(-1.084834098815918);
        expect(input_values.data[1025]).toBeCloseTo(-1.1204065084457397);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "padding",
      async () => {
        const audio = await load_cached_audio("mlk");
        // A 1000-sample slice is far shorter than the model's fixed input size.
        const { input_values } = await feature_extractor(audio.slice(0, 1000));
        expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128]

        // The first rows contain real features and must match the values
        // asserted in the truncation test above for the same clip.
        expect(input_values.mean().item()).toBeCloseTo(0.4647964835166931);
        expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914);
        expect(input_values.data[1]).toBeCloseTo(-1.0300861597061157);
        expect(input_values.data[129]).toBeCloseTo(-1.084834098815918);

        // padded values
        // All indices past the real frames share one constant padding value.
        expect(input_values.data[1025]).toBeCloseTo(0.46703237295150757);
        expect(input_values.data[2049]).toBeCloseTo(0.46703237295150757);
        expect(input_values.data[10000]).toBeCloseTo(0.46703237295150757);
      },
      MAX_TEST_EXECUTION_TIME,
    );
  });
};
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import { AutoFeatureExtractor, ClapFeatureExtractor } from "../../../src/transformers.js";
2+
3+
import { load_cached_audio } from "../../asset_cache.js";
4+
import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";
5+
6+
// Unit tests for ClapFeatureExtractor: loads the pretrained extractor once,
// then spot-checks the mel input features for truncated and padded inputs.
// NOTE(review): the numeric reference values below were presumably generated
// from a trusted (e.g. Python) implementation — confirm their provenance
// before updating any of them.
export default () => {
  // ClapFeatureExtractor
  describe("ClapFeatureExtractor", () => {
    const model_id = "Xenova/clap-htsat-unfused";

    /** @type {ClapFeatureExtractor} */
    let feature_extractor;
    beforeAll(async () => {
      feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id);
    }, MAX_FEATURE_EXTRACTOR_LOAD_TIME);

    it(
      "truncation",
      async () => {
        const audio = await load_cached_audio("mlk");

        // Since truncation uses a random strategy, we override
        // Math.random to ensure that the test is deterministic
        const originalRandom = Math.random;
        Math.random = () => 0.5;
        try {
          // Build an over-long input by writing the clip at both the start and
          // the end of a 500k-sample zero-filled buffer.
          const long_audio = new Float32Array(500000);
          long_audio.set(audio);
          long_audio.set(audio, long_audio.length - audio.length);

          const { input_features } = await feature_extractor(long_audio);
          const { dims, data } = input_features;
          expect(dims).toEqual([1, 1, 1001, 64]);

          expect(input_features.mean().item()).toBeCloseTo(-37.94569396972656);
          expect(data[0]).toBeCloseTo(-53.32647705078125);
          expect(data[1]).toBeCloseTo(-47.76755142211914);
          expect(data[65]).toBeCloseTo(-36.32261276245117);
          expect(data[1002]).toBeCloseTo(-28.0314884185791);
          expect(data[10000]).toBeCloseTo(-21.905902862548828);
          expect(data[60000]).toBeCloseTo(-14.877863883972168);
          expect(data[64062]).toBeCloseTo(-37.9784049987793);
          expect(data[64063]).toBeCloseTo(-37.73963928222656);
        } finally {
          // Restore Math.random even if the extractor or an expectation throws,
          // so the stubbed RNG cannot leak into subsequent tests.
          Math.random = originalRandom;
        }
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "padding",
      async () => {
        const audio = await load_cached_audio("mlk");
        const { input_features } = await feature_extractor(audio);
        const { data, dims } = input_features;
        expect(dims).toEqual([1, 1, 1001, 64]);

        expect(input_features.mean().item()).toBeCloseTo(-34.99049377441406);
        expect(data[0]).toBeCloseTo(-21.32573890686035);
        expect(data[1]).toBeCloseTo(-26.168411254882812);
        expect(data[65]).toBeCloseTo(-29.716018676757812);
        expect(data[1002]).toBeCloseTo(-32.16273498535156);
        expect(data[10000]).toBeCloseTo(-19.9283390045166);

        // padded values
        // Indices past the real frames hold the constant padding value (-100).
        expect(data[60000]).toBeCloseTo(-100.0);
        expect(data[64062]).toBeCloseTo(-100.0);
        expect(data[64063]).toBeCloseTo(-100.0);
      },
      MAX_TEST_EXECUTION_TIME,
    );
  });
};

0 commit comments

Comments (0)