Commit 41a6139

Add support for visualizing self-attention heatmaps + sequence classifier outputs w/ attentions (#1117)
* Add support for nearest neighbour interpolation
* Support bigint inputs for `min` and `max` helper functions
* Add min/max/argmin/argmax tensor ops
* Allow sequence classifier outputs with attentions
* Align depth estimation pipeline w/ python version
* Update `min` and `max` types
* Fix qwen2vl processor
* Update depth estimation pipeline unit tests
* Remove old comments
* Update types
* Fix type issues
1 parent 3aa7c78 commit 41a6139

File tree

11 files changed (+121 −47 lines)


src/models.js (+31 −15)

@@ -4463,6 +4463,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
         const image_nums = vision_tokens.filter(x => x == image_token_id).length;
         const video_nums = vision_tokens.filter(x => x == video_token_id).length;
 
+        /** @type {number[][]} */
         let llm_pos_ids_list = [];
         let st = 0;
         let remain_images = image_nums;
@@ -4532,6 +4533,7 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
         // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
         // meaning to perform concatenation along dim=1, we can do the following:
         const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
+        /** @type {number[]} */
         const llm_positions = new Array(num_items);
         let index = 0;
         for (let x = 0; x < 3; ++x) {
@@ -4572,9 +4574,10 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
             { length: 3 * data.length },
             (_, i) => data[i % data.length]
         );
+        /** @type {bigint[]} */
         const mrope_position_deltas = Array.from(
             { length: dims[0] },
-            (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1 + dims[1]
+            (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
         );
 
         return [
@@ -5145,7 +5148,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
  * ```javascript
- * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
  *
  * // Load model and processor
  * const model_id = 'Xenova/dpt-hybrid-midas';
@@ -5154,7 +5157,7 @@ export class DPTModel extends DPTPreTrainedModel { }
  *
  * // Load image from URL
  * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
  *
  * // Prepare image for the model
  * const inputs = await processor(image);
@@ -5163,10 +5166,15 @@ export class DPTModel extends DPTPreTrainedModel { }
  * const { predicted_depth } = await model(inputs);
  *
  * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ *     size: image.size.reverse(),
+ *     mode: 'bilinear',
+ * })).squeeze(1);
  *
  * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
 * const depth = RawImage.fromTensor(formatted);
 * // RawImage {
 * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
@@ -5216,11 +5224,7 @@ export class GLPNPreTrainedModel extends PreTrainedModel { }
 export class GLPNModel extends GLPNPreTrainedModel { }
 
 /**
- * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
- *
- * **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
- * ```javascript
- * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
+ * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
 *
 * // Load model and processor
 * const model_id = 'Xenova/glpn-kitti';
@@ -5229,7 +5233,7 @@ export class GLPNModel extends GLPNPreTrainedModel { }
 *
 * // Load image from URL
 * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
- * const image = await RawImage.fromURL(url);
+ * const image = await RawImage.read(url);
 *
 * // Prepare image for the model
 * const inputs = await processor(image);
@@ -5238,13 +5242,18 @@ export class GLPNModel extends GLPNPreTrainedModel { }
 * const { predicted_depth } = await model(inputs);
 *
 * // Interpolate to original size
- * const prediction = interpolate(predicted_depth, image.size.reverse(), 'bilinear', false);
+ * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
+ *     size: image.size.reverse(),
+ *     mode: 'bilinear',
+ * })).squeeze(1);
 *
 * // Visualize the prediction
- * const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+ * const min = prediction.min().item();
+ * const max = prediction.max().item();
+ * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
 * const depth = RawImage.fromTensor(formatted);
 * // RawImage {
- * //   data: Uint8Array(307200) [ 207, 169, 154, ... ],
+ * //   data: Uint8Array(307200) [ 85, 85, 84, ... ],
 * //   width: 640,
 * //   height: 480,
 * //   channels: 1
@@ -7747,10 +7756,17 @@ export class SequenceClassifierOutput extends ModelOutput {
     /**
      * @param {Object} output The output of the model.
      * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
+     * @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+     * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
      */
-    constructor({ logits }) {
+    constructor({ logits, ...attentions }) {
         super();
         this.logits = logits;
+        const attentions_list = Object.values(attentions);
+        if (attentions_list.length > 0) {
+            // Only set attentions if they are not empty
+            this.attentions = attentions_list;
+        }
     }
 }
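With `attentions` now surfaced on `SequenceClassifierOutput`, a caller can turn a head's attention matrix into the heatmap the commit title refers to. A minimal sketch, assuming a text-classification checkpoint exported with attention outputs (the model id is illustrative; `attentions` stays `undefined` for models exported without them):

```js
import { AutoTokenizer, AutoModelForSequenceClassification, Tensor, RawImage } from '@huggingface/transformers';

const model_id = 'Xenova/distilbert-base-uncased-finetuned-sst-2-english'; // illustrative choice
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
const model = await AutoModelForSequenceClassification.from_pretrained(model_id);

const inputs = tokenizer('I love transformers!');
const { logits, attentions } = await model(inputs);

if (attentions) {
    // One Tensor per layer, each of shape (batch_size, num_heads, seq_len, seq_len).
    const last_layer = attentions.at(-1);
    const seq_len = last_layer.dims.at(-1);

    // Head 0 of batch 0 occupies the first seq_len * seq_len values.
    const head = new Tensor(
        'float32',
        last_layer.data.slice(0, seq_len * seq_len),
        [1, seq_len, seq_len],
    );

    // Min-max normalize to [0, 255], mirroring the depth visualization above.
    const minval = head.min().item();
    const maxval = head.max().item();
    const heatmap = RawImage.fromTensor(
        head.sub_(minval).div_(maxval - minval).mul_(255).to('uint8'),
    );
    await heatmap.save('attention_head0.png'); // Node.js only
}
```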

src/models/idefics3/image_processing_idefics3.js (+2 −0)

@@ -146,6 +146,8 @@ export class Idefics3ImageProcessor extends ImageProcessor {
 
             const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
             const end_offset = (i + 1) * pixel_attention_mask_stride;
+
+            // @ts-expect-error
             pixel_attention_mask_data.fill(false, start_offset, end_offset);
         }
     }
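The suppression is needed because the mask's backing store is a typed array whose `fill` signature expects a `number`, while the code passes `false` (which coerces to `0` at runtime). A standalone illustration of the same pattern:

```js
// Zero out the tail of a mask; `false` coerces to 0 at runtime, but
// TypeScript's TypedArray.fill signature only accepts numbers.
const mask = new Uint8Array(8).fill(1);
// @ts-expect-error -- boolean passed intentionally for readability
mask.fill(false, 4, 8);
console.log(mask); // Uint8Array(8) [1, 1, 1, 1, 0, 0, 0, 0]
```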

src/models/pyannote/feature_extraction_pyannote.js (+1 −0)

@@ -52,6 +52,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
 
         let current_speaker = -1;
         for (let i = 0; i < scores.length; ++i) {
+            /** @type {number[]} */
             const probabilities = softmax(scores[i]);
             const [score, id] = max(probabilities);
             const [start, end] = [i, i + 1];
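For context, each `scores[i]` row holds per-speaker logits for one frame; `softmax` converts them to probabilities and `max` picks the winning speaker. A self-contained sketch of that step, with local helpers rather than the library's imports:

```js
// Numerically stable softmax over one frame of logits.
const softmax = (logits) => {
    const maxLogit = Math.max(...logits);
    const exps = logits.map(x => Math.exp(x - maxLogit));
    const sum = exps.reduce((a, b) => a + b, 0);
    return exps.map(x => x / sum);
};

const frame_scores = [0.2, 2.5, -1.0]; // logits for 3 speakers in one frame
const probabilities = softmax(frame_scores);
const id = probabilities.indexOf(Math.max(...probabilities));
console.log(id); // 1 -> this frame is attributed to speaker 1
```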

src/models/seamless_m4t/feature_extraction_seamless_m4t.js (+2 −2)

@@ -133,8 +133,8 @@ export class SeamlessM4TFeatureExtractor extends FeatureExtractor {
                 'int64',
                 new BigInt64Array(numPaddedFrames),
                 [1, numPaddedFrames],
-            )
-            padded_attention_mask.data.fill(1n, 0, num_frames);
+            );
+            /** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
         }
     }
 }

src/models/whisper/feature_extraction_whisper.js (+1 −1)

@@ -44,7 +44,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
         )
 
         const data = features.data;
-        const maxValue = max(data)[0];
+        const maxValue = max(/** @type {Float32Array} */(data))[0];
 
         for (let i = 0; i < data.length; ++i) {
             data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
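For reference, this loop implements Whisper's log-mel normalization: each value is clamped to within 8.0 of the global maximum, then shifted and rescaled into a window of width 2 ending at `(maxValue + 4) / 4`. A worked example of the formula (standalone, with an assumed `maxValue` of 2.0):

```js
const maxValue = 2.0; // assumed global max of the log-mel spectrogram
const normalize = (x) => (Math.max(x, maxValue - 8.0) + 4.0) / 4.0;

console.log(normalize(2.0));   // 1.5  -> the maximum maps to 1.5
console.log(normalize(-10.0)); // -0.5 -> clamped at maxValue - 8.0 = -6.0
```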

src/ops/registry.js (+10 −0)

@@ -36,6 +36,16 @@ export class TensorOpRegistry {
         // executionProviders: ['webgpu'],
     };
 
+    static get nearest_interpolate_4d() {
+        if (!this._nearest_interpolate_4d) {
+            this._nearest_interpolate_4d = wrap(
+                [8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
+                this.session_options,
+                'y',
+            );
+        }
+        return this._nearest_interpolate_4d;
+    }
     static get bilinear_interpolate_4d() {
         if (!this._bilinear_interpolate_4d) {
             this._bilinear_interpolate_4d = wrap(
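For the curious: the inlined array is a serialized ONNX model, and the operator it wraps is visible directly in the protobuf bytes. An illustrative decode of two slices of the array above:

```js
// `34, 6` introduces a length-6 string field: the node's op_type.
console.log(String.fromCharCode(82, 101, 115, 105, 122, 101));      // "Resize"
// `34, 7` after the `mode` attribute name (109, 111, 100, 101): its value.
console.log(String.fromCharCode(110, 101, 97, 114, 101, 115, 116)); // "nearest"
```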

src/pipelines.js (+17 −5)

@@ -69,7 +69,7 @@ import {
 import {
     Tensor,
     mean_pooling,
-    interpolate,
+    interpolate_4d,
     quantize_embeddings,
     topk,
 } from './utils/tensor.js';
@@ -2901,11 +2901,23 @@ export class DepthEstimationPipeline extends (/** @type {new (options: ImagePipe
 
         const toReturn = [];
         for (let i = 0; i < preparedImages.length; ++i) {
-            const prediction = interpolate(predicted_depth[i], preparedImages[i].size.reverse(), 'bilinear', false);
-            const formatted = prediction.mul_(255 / max(prediction.data)[0]).to('uint8');
+            const batch = predicted_depth[i];
+            const [height, width] = batch.dims.slice(-2);
+            const [new_width, new_height] = preparedImages[i].size;
+
+            // Interpolate to original size
+            const prediction = (await interpolate_4d(batch.view(1, 1, height, width), {
+                size: [new_height, new_width],
+                mode: 'bilinear',
+            })).view(new_height, new_width);
+
+            const minval = /** @type {number} */(prediction.min().item());
+            const maxval = /** @type {number} */(prediction.max().item());
+            const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
+            const depth = RawImage.fromTensor(formatted);
             toReturn.push({
-                predicted_depth: predicted_depth[i],
-                depth: RawImage.fromTensor(formatted),
+                predicted_depth: prediction,
+                depth,
             });
         }
 
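With this change the pipeline matches the Python behavior: `predicted_depth` is the interpolated (input-sized) depth map and `depth` is its min-max-normalized visualization. A usage sketch (the checkpoint is the one used in the docstrings above; any depth-estimation model should behave the same):

```js
import { pipeline } from '@huggingface/transformers';

const depth_estimator = await pipeline('depth-estimation', 'Xenova/dpt-hybrid-midas');
const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const { predicted_depth, depth } = await depth_estimator(url);

console.log(predicted_depth.dims); // e.g. [480, 640] -- now resized to the input image
console.log(depth); // RawImage { width: 640, height: 480, channels: 1, ... }
```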

src/tokenizers.js (+3 −4)

@@ -533,19 +533,18 @@ class Unigram extends TokenizerModel {
     * Create a new Unigram tokenizer model.
     * @param {Object} config The configuration object for the Unigram model.
     * @param {number} config.unk_id The ID of the unknown token
-    * @param {any[][]} config.vocab A 2D array representing a mapping of tokens to scores.
+    * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
     * @param {Object} moreConfig Additional configuration object for the Unigram model.
     */
    constructor(config, moreConfig) {
        super(config);
 
        const vocabSize = config.vocab.length;
        this.vocab = new Array(vocabSize);
+        /** @type {number[]} */
        this.scores = new Array(vocabSize);
        for (let i = 0; i < vocabSize; ++i) {
-            const piece = config.vocab[i];
-            this.vocab[i] = piece[0];
-            this.scores[i] = piece[1];
+            [this.vocab[i], this.scores[i]] = config.vocab[i];
        }
 
        this.unk_token_id = config.unk_id;
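For context, each `config.vocab` entry is a `[token, score]` pair, so the destructuring assignment is behavior-identical to the three lines it replaces. A standalone sketch with a made-up vocab:

```js
/** @type {[string, number][]} A made-up Unigram vocab: [token, log-probability] pairs. */
const vocab = [
    ['<unk>', 0],
    ['▁the', -3.27],
    ['▁transformers', -9.81],
];

const tokens = new Array(vocab.length);
const scores = new Array(vocab.length);
for (let i = 0; i < vocab.length; ++i) {
    // One [string, number] pair unpacked per iteration.
    [tokens[i], scores[i]] = vocab[i];
}
console.log(tokens); // ['<unk>', '▁the', '▁transformers']
console.log(scores); // [0, -3.27, -9.81]
```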

src/utils/maths.js (+8 −6)

@@ -225,8 +225,9 @@ export function magnitude(arr) {
 
 /**
  * Returns the value and index of the minimum element in an array.
- * @param {number[]|TypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
  * @throws {Error} If array is empty.
  */
 export function min(arr) {
@@ -239,14 +240,15 @@ export function min(arr) {
             indexOfMin = i;
         }
     }
-    return [min, indexOfMin];
+    return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
 }
 
 
 /**
  * Returns the value and index of the maximum element in an array.
- * @param {number[]|AnyTypedArray} arr array of numbers.
- * @returns {[number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
+ * @template {number[]|bigint[]|AnyTypedArray} T
+ * @param {T} arr array of numbers.
+ * @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
  * @throws {Error} If array is empty.
  */
 export function max(arr) {
@@ -259,7 +261,7 @@ export function max(arr) {
             indexOfMax = i;
         }
     }
-    return [Number(max), indexOfMax];
+    return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
 }
 
 function isPowerOfTwo(number) {
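With the generic signatures, `min` and `max` now preserve `bigint` element types instead of coercing through `Number(...)`; the Qwen2-VL `mrope_position_deltas` fix above relies on exactly this (`+ 1n` would throw if the value came back as a `number`). A quick sketch, assuming the helpers are imported from the package as in the pre-change docstrings:

```js
import { min, max } from '@huggingface/transformers'; // assumed export path

// number inputs -> [number, number]
const [maxVal, maxIdx] = max(new Float32Array([1.5, 9.25, 3.0]));
console.log(maxVal, maxIdx); // 9.25 1

// bigint inputs -> [bigint, number]; arithmetic stays in bigint
const [bigVal] = max(new BigInt64Array([3n, 7n, 5n]));
console.log(bigVal + 1n); // 8n

const [minVal] = min([4n, 2n, 8n]);
console.log(minVal); // 2n
```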

src/utils/tensor.js (+42 −10)

@@ -9,6 +9,8 @@
 
 import {
     interpolate_data,
+    max,
+    min,
     permute_data
 } from './maths.js';
 
@@ -464,8 +466,6 @@ export class Tensor {
         return this.permute(...dims);
     }
 
-    // TODO add .max() and .min() methods
-
     /**
      * Returns the sum of each row of the input tensor in the given dimension dim.
      *
@@ -759,6 +759,36 @@ export class Tensor {
         return mean(this, dim, keepdim);
     }
 
+    min(dim = null, keepdim = false) {
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const value = min(this.data)[0];
+        return new Tensor(this.type, [value], []);
+    }
+    max(dim = null, keepdim = false) {
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const value = max(this.data)[0];
+        return new Tensor(this.type, [value], []);
+    }
+
+    argmin(dim = null, keepdim = false) {
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const index = min(this.data)[1];
+        return new Tensor('int64', [BigInt(index)], []);
+    }
+    argmax(dim = null, keepdim = false) {
+        if (dim !== null) {
+            throw new Error("`dim !== null` not yet implemented.");
+        }
+        const index = max(this.data)[1];
+        return new Tensor('int64', [BigInt(index)], []);
+    }
+
     /**
      * Performs Tensor dtype conversion.
      * @param {DataType} type The desired data type.
892922
* @param {Tensor} input the input tensor
893923
* @param {Object} options the options for the interpolation
894924
* @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
895-
* @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
925+
* @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
896926
* @returns {Promise<Tensor>} The interpolated tensor.
897927
*/
898928
export async function interpolate_4d(input, {
@@ -922,7 +952,9 @@ export async function interpolate_4d(input, {
922952
}
923953

924954
let op;
925-
if (mode === 'bilinear') {
955+
if (mode === 'nearest') {
956+
op = await TensorOpRegistry.nearest_interpolate_4d;
957+
} else if (mode === 'bilinear') {
926958
op = await TensorOpRegistry.bilinear_interpolate_4d;
927959
} else if (mode === 'bicubic') {
928960
op = await TensorOpRegistry.bicubic_interpolate_4d;
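With the new branch, `'nearest'` joins `'bilinear'` and `'bicubic'` as a supported `interpolate_4d` mode, backed by the ONNX `Resize` graph registered above. A small sketch of nearest-neighbour upsampling on an NCHW tensor:

```js
import { Tensor, interpolate_4d } from '@huggingface/transformers';

// A 1x1x2x2 input upsampled to 4x4: nearest-neighbour repeats each
// source pixel into a 2x2 block instead of blending neighbours.
const input = new Tensor('float32', [1, 2, 3, 4], [1, 1, 2, 2]);
const output = await interpolate_4d(input, {
    size: [4, 4],
    mode: 'nearest',
});
console.log(output.dims);             // [1, 1, 4, 4]
console.log(output.data.slice(0, 4)); // Float32Array [1, 1, 2, 2] -- first row
```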
@@ -963,13 +995,13 @@ export async function rfft(x, a) {
  * Returns the k largest elements of the given input tensor.
  * Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
  * @param {Tensor} x the input tensor
- * @param {number} k the k in "top-k"
+ * @param {number} [k] the k in "top-k"
  * @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
  */
 export async function topk(x, k) {
     const op = await TensorOpRegistry.top_k;
 
-    if (k === null) {
+    if (k == null) {
         k = x.dims.at(-1);
     } else {
         k = Math.min(k, x.dims.at(-1));
@@ -998,10 +1030,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
 export async function slice(data, starts, ends, axes, steps) {
     const op = await TensorOpRegistry.slice;
     return await op({
-        x: data,
-        s: arrayToIndexTensor(starts),
-        e: arrayToIndexTensor(ends),
-        a: arrayToIndexTensor(axes),
+        x: data,
+        s: arrayToIndexTensor(starts),
+        e: arrayToIndexTensor(ends),
+        a: arrayToIndexTensor(axes),
         t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
     });
 }
