
Commit 4f1f920

Merge branch 'main' into v4

2 parents: 117d2bb + 8337acc

File tree

11 files changed, +130 −34 lines changed

.github/workflows/publish.yml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ jobs:
       # Setup .npmrc file to publish to npm
       - uses: actions/setup-node@v3
         with:
-          node-version: 'latest'
+          node-version: '24'
           registry-url: 'https://registry.npmjs.org'
       - run: npm ci
       - run: npm run build

src/base/image_processors_utils.js

Lines changed: 1 addition & 1 deletion

@@ -975,7 +975,7 @@ export class ImageProcessor extends Callable {

         let image_std = this.image_std;
         if (!Array.isArray(this.image_std)) {
-            image_std = new Array(image.channels).fill(image_mean);
+            image_std = new Array(image.channels).fill(image_std);
         }

         if (image_mean.length !== image.channels || image_std.length !== image.channels) {
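Aside (not part of the diff): when `image_std` arrives as a scalar in the processor config, it is broadcast to one value per channel. Before this fix, the broadcast filled the array with `image_mean`, so images were normalized by the mean instead of the standard deviation. A minimal standalone sketch of the broadcasting logic, with hypothetical values:

const channels = 3;
let image_mean = 0.5;   // scalar config values, as a model config might provide
let image_std = 0.25;

// Broadcast scalars to one entry per channel (the fixed behavior)
if (!Array.isArray(image_mean)) image_mean = new Array(channels).fill(image_mean);
if (!Array.isArray(image_std)) image_std = new Array(channels).fill(image_std);

// Per-channel normalization: (pixel - mean) / std
const normalize = (value, c) => (value - image_mean[c]) / image_std[c];
console.log(normalize(1.0, 0)); // 2 with the fix; the old fill(image_mean) would give 1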

src/models.js

Lines changed: 105 additions & 9 deletions

@@ -6132,16 +6132,14 @@ export class SamModel extends SamPreTrainedModel {
             // Compute the image embeddings if they are missing
             model_inputs = {
                 ...model_inputs,
-                ...(await this.get_image_embeddings(model_inputs)),
-            };
+                ...(await this.get_image_embeddings(model_inputs))
+            }
+        } else {
+            model_inputs = { ...model_inputs };
         }

-        if (!model_inputs.input_labels && model_inputs.input_points) {
-            // Set default input labels if they are missing
-            const shape = model_inputs.input_points.dims.slice(0, -1);
-            const numElements = shape.reduce((a, b) => a * b, 1);
-            model_inputs.input_labels = new Tensor('int64', new BigInt64Array(numElements).fill(1n), shape);
-        }
+        // Set default input labels if they are missing
+        model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1));

         const decoder_inputs = {
             image_embeddings: model_inputs.image_embeddings,

@@ -6190,6 +6188,101 @@ export class SamImageSegmentationOutput extends ModelOutput {
 }
 //////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class Sam2ImageSegmentationOutput extends ModelOutput {
+    /**
+     * @param {Object} output The output of the model.
+     * @param {Tensor} output.iou_scores The output logits of the model.
+     * @param {Tensor} output.pred_masks Predicted masks.
+     * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present.
+     */
+    constructor({ iou_scores, pred_masks, object_score_logits }) {
+        super();
+        this.iou_scores = iou_scores;
+        this.pred_masks = pred_masks;
+        this.object_score_logits = object_score_logits;
+    }
+}
+
+export class EdgeTamPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * EdgeTAM for generating segmentation masks, given an input image
+ * and optional 2D location and bounding boxes.
+ */
+export class EdgeTamModel extends EdgeTamPreTrainedModel {
+
+    /**
+     * Compute image embeddings and positional image embeddings, given the pixel values of an image.
+     * @param {Object} model_inputs Object containing the model inputs.
+     * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`.
+     * @returns {Promise<Record<String, Tensor>>} The image embeddings.
+     */
+    async get_image_embeddings({ pixel_values }) {
+        // in:
+        //  - pixel_values: tensor.float32[batch_size,3,1024,1024]
+        //
+        // out:
+        //  - image_embeddings.0: tensor.float32[batch_size,32,256,256]
+        //  - image_embeddings.1: tensor.float32[batch_size,64,128,128]
+        //  - image_embeddings.2: tensor.float32[batch_size,256,64,64]
+        return await encoderForward(this, { pixel_values });
+    }
+
+    async forward(model_inputs) {
+        // @ts-expect-error ts(2339)
+        const { num_feature_levels } = this.config.vision_config;
+        const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`);
+
+        if (image_embeddings_name.some(name => !model_inputs[name])) {
+            // Compute the image embeddings if they are missing
+            model_inputs = {
+                ...model_inputs,
+                ...(await this.get_image_embeddings(model_inputs))
+            }
+        } else {
+            model_inputs = { ...model_inputs };
+        }
+
+        if (model_inputs.input_points) {
+            if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) {
+                throw new Error('When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.');
+            }
+            const shape = model_inputs.input_points.dims;
+            model_inputs.input_labels ??= ones(shape.slice(0, -1));
+            model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0);
+
+        } else if (model_inputs.input_boxes) { // only boxes
+            const shape = model_inputs.input_boxes.dims;
+            model_inputs.input_labels = full([shape[0], shape[1], 0], -1n);
+            model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0);
+
+        } else {
+            throw new Error('At least one of `input_points` or `input_boxes` must be provided.');
+        }
+
+        const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder'];
+        const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames);
+
+        // Returns:
+        //  - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3]
+        //  - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256]
+        //  - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1]
+        return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs);
+    }
+
+    /**
+     * Runs the model with the provided inputs
+     * @param {Object} model_inputs Model inputs
+     * @returns {Promise<Sam2ImageSegmentationOutput>} Object containing segmentation outputs
+     */
+    async _call(model_inputs) {
+        return new Sam2ImageSegmentationOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
 //////////////////////////////////////////////////
 // MarianMT models
 export class MarianPreTrainedModel extends PreTrainedModel {}

@@ -8384,7 +8477,10 @@ const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
     ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
 ]);

-const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([['sam', ['SamModel', SamModel]]]);
+const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
+    ['sam', ['SamModel', SamModel]],
+    ['edgetam', ['EdgeTamModel', EdgeTamModel]],
+]);

 const MODEL_FOR_CTC_MAPPING_NAMES = new Map([
     ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]],
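Aside (not part of the diff): a minimal usage sketch for the new `EdgeTamModel`, assuming the same prompt-based API as `SamModel`. The model id and image URL below are placeholders, not names confirmed by this commit:

import { AutoModel, AutoProcessor, RawImage } from '@huggingface/transformers';

// Hypothetical model id; any ONNX EdgeTAM export registered under the
// `edgetam` model type should load the same way.
const model = await AutoModel.from_pretrained('onnx-community/edgetam');
const processor = await AutoProcessor.from_pretrained('onnx-community/edgetam');

const image = await RawImage.read('https://example.com/image.png'); // placeholder URL
const input_points = [[[450, 600]]]; // one (x, y) prompt point for one image

const inputs = await processor(image, { input_points });
const { iou_scores, pred_masks, object_score_logits } = await model(inputs);
// pred_masks:          [batch, num_points_or_boxes, 3, 256, 256] candidate masks
// iou_scores:          [batch, num_points_or_boxes, 3] predicted quality per candidate
// object_score_logits: [batch, num_points_or_boxes, 1] object-presence logits

As the `forward` implementation above shows, passing only `input_boxes` is also supported; omitting both points and boxes throws.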

src/models/image_processors.js

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ export * from './pvt/image_processing_pvt.js';
 export * from './qwen2_vl/image_processing_qwen2_vl.js';
 export * from './rt_detr/image_processing_rt_detr.js';
 export * from './sam/image_processing_sam.js';
+export * from './sam2/image_processing_sam2.js';
 export * from './segformer/image_processing_segformer.js';
 export * from './siglip/image_processing_siglip.js';
 export * from './smolvlm/image_processing_smolvlm.js';

src/models/processors.js

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
 export * from './sam/processing_sam.js';
+export * from './sam2/processing_sam2.js';
 export * from './smolvlm/processing_smolvlm.js';
 export * from './speecht5/processing_speecht5.js';
 export * from './ultravox/processing_ultravox.js';

src/models/sam/image_processing_sam.js

Lines changed: 8 additions & 9 deletions

@@ -40,15 +40,14 @@ export class SamImageProcessor extends ImageProcessor {
         }

         // Reshape input points
-        for (let i = 0; i < input_points.length; ++i) {
-            // batch_size
-            let originalImageSize = original_sizes[i];
-            let reshapedImageSize = reshaped_input_sizes[i];
+        for (let i = 0; i < input_points.length; ++i) { // batch_size
+            const [originalHeight, originalWidth] = original_sizes[i];
+            const [reshapedHeight, reshapedWidth] = reshaped_input_sizes[i];

-            let resizeFactors = [
-                reshapedImageSize[0] / originalImageSize[0],
-                reshapedImageSize[1] / originalImageSize[1],
-            ];
+            const resizeFactors = [
+                reshapedWidth / originalWidth,
+                reshapedHeight / originalHeight,
+            ]

             for (let j = 0; j < input_points[i].length; ++j) {
                 // point_batch_size

@@ -163,7 +162,7 @@

         const output_masks = [];

-        pad_size = pad_size ?? this.pad_size;
+        pad_size = pad_size ?? this.pad_size ?? this.size;

         /** @type {[number, number]} */
         const target_image_size = [pad_size.height, pad_size.width];
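Aside (not part of the diff): the first hunk does more than rename variables — it reorders `resizeFactors` to `[width ratio, height ratio]`. Prompt points are `(x, y)` pairs, so `x` must be scaled by the width ratio and `y` by the height ratio; the two orderings only coincide when the resize preserves aspect ratio. A standalone sketch with hypothetical sizes:

// A 640x480 image resized to a square 1024x1024 model input.
const [originalHeight, originalWidth] = [480, 640];
const [reshapedHeight, reshapedWidth] = [1024, 1024];

const resizeFactors = [
    reshapedWidth / originalWidth,   // scales x (1.6)
    reshapedHeight / originalHeight, // scales y (~2.133)
];

const point = [320, 240]; // (x, y) in original image coordinates
const rescaled = point.map((coord, k) => coord * resizeFactors[k]);
console.log(rescaled); // [512, 512]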
src/models/sam2/image_processing_sam2.js

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+
+export { SamImageProcessor as Sam2ImageProcessor } from '../sam/image_processing_sam.js';

src/models/sam2/processing_sam2.js

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+import { SamProcessor } from "../sam/processing_sam.js";
+
+export class Sam2VideoProcessor extends SamProcessor { }

src/tokenizers.js

Lines changed: 5 additions & 11 deletions

@@ -2238,15 +2238,13 @@ class VitsDecoder extends Decoder {
 class MetaspacePreTokenizer extends PreTokenizer {
     /**
      * @param {Object} config The configuration object for the MetaspacePreTokenizer.
-     * @param {boolean} config.add_prefix_space Whether to add a prefix space to the first token.
      * @param {string} config.replacement The character to replace spaces with.
      * @param {string} [config.str_rep=config.replacement] An optional string representation of the replacement character.
      * @param {'first'|'never'|'always'} [config.prepend_scheme='always'] The metaspace prepending scheme.
      */
     constructor(config) {
         super();

-        this.addPrefixSpace = config.add_prefix_space;
         this.replacement = config.replacement;
         this.strRep = config.str_rep || this.replacement;
         this.prepend_scheme = config.prepend_scheme ?? 'always';

@@ -2265,10 +2263,9 @@ class MetaspacePreTokenizer extends PreTokenizer {

         if (
             // We add a prefix space if:
-            // (1) The addPrefixSpace option is enabled and the normalized
-            //     token does not already start with the replacement character.
-            this.addPrefixSpace &&
-            !normalized.startsWith(this.replacement) &&
+            // (1) The normalized token does not already start with the replacement character.
+            !normalized.startsWith(this.replacement)
+
             // and (2) either:
             // (a) prepend_scheme is 'always'
             // (b) prepend_scheme is 'first' and this is the first section

@@ -2288,13 +2285,11 @@ class MetaspaceDecoder extends Decoder {
     /**
      * Constructs a new MetaspaceDecoder object.
      * @param {Object} config The configuration object for the MetaspaceDecoder.
-     * @param {boolean} config.add_prefix_space Whether to add a prefix space to the decoded string.
      * @param {string} config.replacement The string to replace spaces with.
      */
     constructor(config) {
         super(config);

-        this.addPrefixSpace = config.add_prefix_space;
         this.replacement = config.replacement;
     }

@@ -2303,7 +2298,7 @@ class MetaspaceDecoder extends Decoder {
         const result = [];
         for (let i = 0; i < tokens.length; ++i) {
             let normalized = tokens[i].replaceAll(this.replacement, ' ');
-            if (this.addPrefixSpace && i == 0 && normalized.startsWith(' ')) {
+            if (i == 0 && normalized.startsWith(' ')) {
                 normalized = normalized.substring(1);
             }
             result.push(normalized);

@@ -3357,8 +3352,7 @@ export class LlamaTokenizer extends PreTrainedTokenizer {
         this.normalizer = null;
         this.pre_tokenizer = new MetaspacePreTokenizer({
             replacement: SPIECE_UNDERLINE,
-            add_prefix_space: true,
-            prepend_scheme: 'first',
+            prepend_scheme: "first",
         });
     }
 }
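Aside (not part of the diff): with `add_prefix_space` removed, whether the Metaspace pre-tokenizer prepends the replacement character is now decided by `prepend_scheme` alone. A standalone sketch of that decision logic (a simplification, not the class itself):

const replacement = '\u2581'; // SPIECE_UNDERLINE ('▁')

function shouldPrepend(normalized, prepend_scheme, isFirstSection) {
    return (
        // skip if the token already starts with the replacement character
        !normalized.startsWith(replacement) &&
        // otherwise follow the scheme
        (prepend_scheme === 'always' ||
            (prepend_scheme === 'first' && isFirstSection))
    );
}

console.log(shouldPrepend('Hello', 'first', true));   // true  -> '▁Hello'
console.log(shouldPrepend('Hello', 'first', false));  // false -> 'Hello'
console.log(shouldPrepend('▁Hello', 'always', true)); // false (already prefixed)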

tests/models/jina_clip/test_processor_jina_clip.js

Lines changed: 2 additions & 2 deletions

@@ -33,8 +33,8 @@ export default () => {
         // Encode text and images
         const { input_ids, attention_mask, pixel_values } = await processor(sentences, images, { padding: true, truncation: true });

-        expect(input_ids.dims).toEqual([sentences.length, 19]);
-        expect(attention_mask.dims).toEqual([sentences.length, 19]);
+        expect(input_ids.dims).toEqual([sentences.length, 14]);
+        expect(attention_mask.dims).toEqual([sentences.length, 14]);
         expect(pixel_values.dims).toEqual([images.length, 3, 512, 512]);
         expect(pixel_values.mean().item()).toBeCloseTo(0.7857685685157776, 6);
     },
