
Commit 4f1f920

Merge branch 'main' into v4

2 parents: 117d2bb + 8337acc

File tree

11 files changed, +130 −34 lines changed

.github/workflows/publish.yml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ jobs:
       # Setup .npmrc file to publish to npm
       - uses: actions/setup-node@v3
         with:
-          node-version: 'latest'
+          node-version: '24'
           registry-url: 'https://registry.npmjs.org'
       - run: npm ci
       - run: npm run build

src/base/image_processors_utils.js

Lines changed: 1 addition & 1 deletion

@@ -975,7 +975,7 @@ export class ImageProcessor extends Callable {

         let image_std = this.image_std;
         if (!Array.isArray(this.image_std)) {
-            image_std = new Array(image.channels).fill(image_mean);
+            image_std = new Array(image.channels).fill(image_std);
         }

         if (image_mean.length !== image.channels || image_std.length !== image.channels) {
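Aside (not part of the diff): when `image_std` arrives as a scalar in the processor config, it is broadcast to one value per channel. Before this fix, the broadcast filled the array with `image_mean`, so images were normalized by the mean instead of the standard deviation. A minimal standalone sketch of the broadcasting logic, with hypothetical values:

const channels = 3;
let image_mean = 0.5;   // scalar config values, as a model config might provide
let image_std = 0.25;

// Broadcast scalars to one entry per channel (the fixed behavior)
if (!Array.isArray(image_mean)) image_mean = new Array(channels).fill(image_mean);
if (!Array.isArray(image_std)) image_std = new Array(channels).fill(image_std);

// Per-channel normalization: (pixel - mean) / std
const normalize = (value, c) => (value - image_mean[c]) / image_std[c];
console.log(normalize(1.0, 0)); // 2 with the fix; the old fill(image_mean) would give 1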

src/models.js

Lines changed: 105 additions & 9 deletions

@@ -6132,16 +6132,14 @@ export class SamModel extends SamPreTrainedModel {
             // Compute the image embeddings if they are missing
             model_inputs = {
                 ...model_inputs,
-                ...(await this.get_image_embeddings(model_inputs)),
-            };
+                ...(await this.get_image_embeddings(model_inputs))
+            }
+        } else {
+            model_inputs = { ...model_inputs };
         }

-        if (!model_inputs.input_labels && model_inputs.input_points) {
-            // Set default input labels if they are missing
-            const shape = model_inputs.input_points.dims.slice(0, -1);
-            const numElements = shape.reduce((a, b) => a * b, 1);
-            model_inputs.input_labels = new Tensor('int64', new BigInt64Array(numElements).fill(1n), shape);
-        }
+        // Set default input labels if they are missing
+        model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1));

         const decoder_inputs = {
             image_embeddings: model_inputs.image_embeddings,

@@ -6190,6 +6188,101 @@ export class SamImageSegmentationOutput extends ModelOutput {
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class Sam2ImageSegmentationOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.iou_scores The output logits of the model.
+     * @param {Tensor} output.pred_masks Predicted masks.
+     * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present.
+     */
+    constructor({ iou_scores, pred_masks, object_score_logits }) {
+        super();
+        this.iou_scores = iou_scores;
+        this.pred_masks = pred_masks;
+        this.object_score_logits = object_score_logits;
+    }
+}
+
+export class EdgeTamPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * EdgeTAM for generating segmentation masks, given an input image
+ * and optional 2D location and bounding boxes.
+ */
+export class EdgeTamModel extends EdgeTamPreTrainedModel {
+
+    /**
+     * Compute image embeddings and positional image embeddings, given the pixel values of an image.
+     * @param {Object} model_inputs Object containing the model inputs.
+     * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`.
+     * @returns {Promise<Record<String, Tensor>>} The image embeddings.
+     */
+    async get_image_embeddings({ pixel_values }) {
+        // in:
+        //  - pixel_values: tensor.float32[batch_size,3,1024,1024]
+        //
+        // out:
+        //  - image_embeddings.0: tensor.float32[batch_size,32,256,256]
+        //  - image_embeddings.1: tensor.float32[batch_size,64,128,128]
+        //  - image_embeddings.2: tensor.float32[batch_size,256,64,64]
+        return await encoderForward(this, { pixel_values });
+    }
+
+    async forward(model_inputs) {
+        // @ts-expect-error ts(2339)
+        const { num_feature_levels } = this.config.vision_config;
+        const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`);
+
+        if (image_embeddings_name.some(name => !model_inputs[name])) {
+            // Compute the image embeddings if they are missing
+            model_inputs = {
+                ...model_inputs,
+                ...(await this.get_image_embeddings(model_inputs))
+            }
+        } else {
+            model_inputs = { ...model_inputs };
+        }
+
+        if (model_inputs.input_points) {
+            if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) {
+                throw new Error('When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.');
+            }
+            const shape = model_inputs.input_points.dims;
+            model_inputs.input_labels ??= ones(shape.slice(0, -1));
+            model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0);
+
+        } else if (model_inputs.input_boxes) { // only boxes
+            const shape = model_inputs.input_boxes.dims;
+            model_inputs.input_labels = full([shape[0], shape[1], 0], -1n);
+            model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0);
+
+        } else {
+            throw new Error('At least one of `input_points` or `input_boxes` must be provided.');
+        }
+
+        const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder'];
+        const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames);
+
+        // Returns:
+        //  - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3]
+        //  - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256]
+        //  - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1]
+        return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs);
+    }
+
+    /**
+     * Runs the model with the provided inputs
+     * @param {Object} model_inputs Model inputs
+     * @returns {Promise<Sam2ImageSegmentationOutput>} Object containing segmentation outputs
+     */
+    async _call(model_inputs) {
+        return new Sam2ImageSegmentationOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
 //////////////////////////////////////////////////
 // MarianMT models
 export class MarianPreTrainedModel extends PreTrainedModel {}

@@ -8384,7 +8477,10 @@ const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
     ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
 ]);

-const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([['sam', ['SamModel', SamModel]]]);
+const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
+    ['sam', ['SamModel', SamModel]],
+    ['edgetam', ['EdgeTamModel', EdgeTamModel]],
+]);

 const MODEL_FOR_CTC_MAPPING_NAMES = new Map([
     ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]],
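Aside (not part of the diff): a minimal usage sketch for the new `EdgeTamModel`, assuming the same prompt-based API as `SamModel`. The model id and image URL below are placeholders, not names confirmed by this commit:

import { AutoModel, AutoProcessor, RawImage } from '@huggingface/transformers';

// Hypothetical model id; any ONNX EdgeTAM export registered under the
// `edgetam` model type should load the same way.
const model = await AutoModel.from_pretrained('onnx-community/edgetam');
const processor = await AutoProcessor.from_pretrained('onnx-community/edgetam');

const image = await RawImage.read('https://example.com/image.png'); // placeholder URL
const input_points = [[[450, 600]]]; // one (x, y) prompt point for one image

const inputs = await processor(image, { input_points });
const { iou_scores, pred_masks, object_score_logits } = await model(inputs);
// pred_masks:          [batch, num_points_or_boxes, 3, 256, 256] candidate masks
// iou_scores:          [batch, num_points_or_boxes, 3] predicted quality per candidate
// object_score_logits: [batch, num_points_or_boxes, 1] object-presence logits

As the `forward` implementation above shows, passing only `input_boxes` is also supported; omitting both points and boxes throws.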

src/models/image_processors.js

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ export * from './pvt/image_processing_pvt.js';
 export * from './qwen2_vl/image_processing_qwen2_vl.js';
 export * from './rt_detr/image_processing_rt_detr.js';
 export * from './sam/image_processing_sam.js';
+export * from './sam2/image_processing_sam2.js';
 export * from './segformer/image_processing_segformer.js';
 export * from './siglip/image_processing_siglip.js';
 export * from './smolvlm/image_processing_smolvlm.js';

src/models/processors.js

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
 export * from './sam/processing_sam.js';
+export * from './sam2/processing_sam2.js';
 export * from './smolvlm/processing_smolvlm.js';
 export * from './speecht5/processing_speecht5.js';
 export * from './ultravox/processing_ultravox.js';

src/models/sam/image_processing_sam.js

Lines changed: 8 additions & 9 deletions

@@ -40,15 +40,14 @@ export class SamImageProcessor extends ImageProcessor {
         }

         // Reshape input points
-        for (let i = 0; i < input_points.length; ++i) {
-            // batch_size
-            let originalImageSize = original_sizes[i];
-            let reshapedImageSize = reshaped_input_sizes[i];
+        for (let i = 0; i < input_points.length; ++i) { // batch_size
+            const [originalHeight, originalWidth] = original_sizes[i];
+            const [reshapedHeight, reshapedWidth] = reshaped_input_sizes[i];

-            let resizeFactors = [
-                reshapedImageSize[0] / originalImageSize[0],
-                reshapedImageSize[1] / originalImageSize[1],
-            ];
+            const resizeFactors = [
+                reshapedWidth / originalWidth,
+                reshapedHeight / originalHeight,
+            ]

             for (let j = 0; j < input_points[i].length; ++j) {
                 // point_batch_size

@@ -163,7 +162,7 @@

         const output_masks = [];

-        pad_size = pad_size ?? this.pad_size;
+        pad_size = pad_size ?? this.pad_size ?? this.size;

         /** @type {[number, number]} */
         const target_image_size = [pad_size.height, pad_size.width];
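Aside (not part of the diff): the first hunk does more than rename variables — it reorders `resizeFactors` to `[width ratio, height ratio]`. Prompt points are `(x, y)` pairs, so `x` must be scaled by the width ratio and `y` by the height ratio; the two orderings only coincide when the resize preserves aspect ratio. A standalone sketch with hypothetical sizes:

// A 640x480 image resized to a square 1024x1024 model input.
const [originalHeight, originalWidth] = [480, 640];
const [reshapedHeight, reshapedWidth] = [1024, 1024];

const resizeFactors = [
    reshapedWidth / originalWidth,   // scales x (1.6)
    reshapedHeight / originalHeight, // scales y (~2.133)
];

const point = [320, 240]; // (x, y) in original image coordinates
const rescaled = point.map((coord, k) => coord * resizeFactors[k]);
console.log(rescaled); // [512, 512]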
src/models/sam2/image_processing_sam2.js

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+
+export { SamImageProcessor as Sam2ImageProcessor } from '../sam/image_processing_sam.js';

src/models/sam2/processing_sam2.js

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+import { SamProcessor } from "../sam/processing_sam.js";
+
+export class Sam2VideoProcessor extends SamProcessor { }

src/tokenizers.js

Lines changed: 5 additions & 11 deletions

@@ -2238,15 +2238,13 @@ class VitsDecoder extends Decoder {
 class MetaspacePreTokenizer extends PreTokenizer {
     /**
      * @param {Object} config The configuration object for the MetaspacePreTokenizer.
-     * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token.
      * @param {string} config.replacement The character to replace spaces with.
      * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character.
      * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme.
      */
     constructor(config) {
         super();

-        this.addPrefixSpace = config.add_prefix_space;
         this.replacement = config.replacement;
         this.strRep = config.str_rep || this.replacement;
         this.prepend_scheme = config.prepend_scheme ?? 'always';

@@ -2265,10 +2263,9 @@ class MetaspacePreTokenizer extends PreTokenizer {

         if (
             // We add a prefix space if:
-            // (1) The addPrefixSpace option is enabled and the normalized
-            //     token does not already start with the replacement character.
-            this.addPrefixSpace &&
-            !normalized.startsWith(this.replacement) &&
+            // (1) The normalized token does not already start with the replacement character.
+            !normalized.startsWith(this.replacement)
+
             // and (2) either:
             // (a) prepend_scheme is 'always'
             // (b) prepend_scheme is 'first' and this is the first section

@@ -2288,13 +2285,11 @@ class MetaspaceDecoder extends Decoder {
     /**
      * Constructs a new MetaspaceDecoder object.
      * @param {Object} config The configuration object for the MetaspaceDecoder.
-     * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string.
      * @param {string} config.replacement The string to replace spaces with.
      */
     constructor(config) {
         super(config);

-        this.addPrefixSpace = config.add_prefix_space;
         this.replacement = config.replacement;
     }

@@ -2303,7 +2298,7 @@ class MetaspaceDecoder extends Decoder {
         const result = [];
         for (let i = 0; i < tokens.length; ++i) {
             let normalized = tokens[i].replaceAll(this.replacement, ' ');
-            if (this.addPrefixSpace && i == 0 && normalized.startsWith(' ')) {
+            if (i == 0 && normalized.startsWith(' ')) {
                 normalized = normalized.substring(1);
             }
             result.push(normalized);

@@ -3357,8 +3352,7 @@ export class LlamaTokenizer extends PreTrainedTokenizer {
         this.normalizer = null;
         this.pre_tokenizer = new MetaspacePreTokenizer({
             replacement: SPIECE_UNDERLINE,
-            add_prefix_space: true,
-            prepend_scheme: 'first',
+            prepend_scheme: "first",
         });
     }
 }
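Aside (not part of the diff): with `add_prefix_space` removed, whether the Metaspace pre-tokenizer prepends the replacement character is now decided by `prepend_scheme` alone. A standalone sketch of that decision logic (a simplification, not the class itself):

const replacement = '\u2581'; // SPIECE_UNDERLINE ('▁')

function shouldPrepend(normalized, prepend_scheme, isFirstSection) {
    return (
        // skip if the token already starts with the replacement character
        !normalized.startsWith(replacement) &&
        // otherwise follow the scheme
        (prepend_scheme === 'always' ||
            (prepend_scheme === 'first' && isFirstSection))
    );
}

console.log(shouldPrepend('Hello', 'first', true));   // true  -> '▁Hello'
console.log(shouldPrepend('Hello', 'first', false));  // false -> 'Hello'
console.log(shouldPrepend('▁Hello', 'always', true)); // false (already prefixed)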

tests/models/jina_clip/test_processor_jina_clip.js

Lines changed: 2 additions & 2 deletions

@@ -33,8 +33,8 @@ export default () => {
         // Encode text and images
         const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true });

-        expect(input_ids.dims).toEqual([sentences.length, 19]);
-        expect(attention_mask.dims).toEqual([sentences.length, 19]);
+        expect(input_ids.dims).toEqual([sentences.length, 14]);
+        expect(attention_mask.dims).toEqual([sentences.length, 14]);
         expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]);
         expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6);
     },
