Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"scripts": {
"format": "prettier --write .",
"format:check": "prettier --check .",
"typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types",
"typegen": "tsc --build",
"dev": "webpack serve --no-client-overlay",
"build": "webpack && npm run typegen",
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
Expand Down
11 changes: 11 additions & 0 deletions src/base/image_processors_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -604,14 +604,20 @@ export class ImageProcessor extends Callable {
this.do_thumbnail = config.do_thumbnail;
this.size = config.size ?? config.image_size;
this.do_resize = config.do_resize ?? (this.size !== undefined);
// @ts-expect-error TS2339
this.size_divisibility = config.size_divisibility ?? config.size_divisor;

this.do_center_crop = config.do_center_crop;
// @ts-expect-error TS2339
this.crop_size = config.crop_size;
// @ts-expect-error TS2339
this.do_convert_rgb = config.do_convert_rgb ?? true;
// @ts-expect-error TS2339
this.do_crop_margin = config.do_crop_margin;

// @ts-expect-error TS2339
this.pad_size = config.pad_size;
// @ts-expect-error TS2339
this.do_pad = config.do_pad;

if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
Expand Down Expand Up @@ -818,6 +824,7 @@ export class ImageProcessor extends Callable {
// Support both formats for backwards compatibility
else if (Number.isInteger(size)) {
shortest_edge = size;
// @ts-expect-error TS2339
longest_edge = this.config.max_size ?? shortest_edge;

} else if (size !== undefined) {
Expand Down Expand Up @@ -886,6 +893,7 @@ export class ImageProcessor extends Callable {
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
// Custom resize logic for Qwen2-VL models
const { min_pixels, max_pixels } = size;
// @ts-expect-error TS2339
const factor = this.config.patch_size * this.config.merge_size;
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
} else {
Expand All @@ -901,6 +909,7 @@ export class ImageProcessor extends Callable {
async resize(image) {
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
return await image.resize(newWidth, newHeight, {
// @ts-expect-error TS2322
resample: this.resample,
});
}
Expand Down Expand Up @@ -951,6 +960,7 @@ export class ImageProcessor extends Callable {

// Resize the image using thumbnail method.
if (this.do_thumbnail) {
// @ts-expect-error TS2345
image = await this.thumbnail(image, this.size, this.resample);
}

Expand All @@ -975,6 +985,7 @@ export class ImageProcessor extends Callable {
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
// occurs with data in the hwc format (height, width, channels),
// to emulate the behavior of the original Python code (w/ numpy).
/** @type {Float32Array} */
let pixelData = Float32Array.from(image.data);
let imgDims = [image.height, image.width, image.channels];

Expand Down
12 changes: 11 additions & 1 deletion src/base/processing_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import { getModelJSON } from '../utils/hub.js';
/**
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
*/


Expand Down Expand Up @@ -61,7 +62,7 @@ export class Processor extends Callable {
}

/**
* @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
*/
get tokenizer() {
return this.components.tokenizer;
Expand All @@ -74,6 +75,11 @@ export class Processor extends Callable {
return this.components.feature_extractor;
}

/**
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
*/
apply_chat_template(messages, options = {}) {
if (!this.tokenizer) {
throw new Error('Unable to apply chat template without a tokenizer.');
Expand All @@ -84,6 +90,10 @@ export class Processor extends Callable {
});
}

/**
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
*/
batch_decode(...args) {
if (!this.tokenizer) {
throw new Error('Unable to decode without a tokenizer.');
Expand Down
5 changes: 5 additions & 0 deletions src/configs.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,19 @@ function getNormalizedConfig(config) {
case 'florence2':
case 'llava_onevision':
case 'idefics3':
// @ts-expect-error TS2339
init_normalized_config = getNormalizedConfig(config.text_config);
break;
case 'moondream1':
// @ts-expect-error TS2339
init_normalized_config = getNormalizedConfig(config.phi_config);
break;
case 'musicgen':
// @ts-expect-error TS2339
init_normalized_config = getNormalizedConfig(config.decoder);
break;
case 'multi_modality':
// @ts-expect-error TS2339
init_normalized_config = getNormalizedConfig(config.language_config);
break;

Expand Down Expand Up @@ -191,6 +195,7 @@ function getNormalizedConfig(config) {
break;

case 'vision-encoder-decoder':
// @ts-expect-error TS2339
const decoderConfig = getNormalizedConfig(config.decoder);

const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
Expand Down
19 changes: 19 additions & 0 deletions src/models.js
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
} else if (session_options.externalData !== undefined) {
externalDataPromises = session_options.externalData.map(async (ext) => {
// if the external data is a string, fetch the file and replace the string with its content
// @ts-expect-error TS2339
if (typeof ext.data === "string") {
// @ts-expect-error TS2339
const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options);
// @ts-expect-error TS2698
return { ...ext, data: ext_buffer };
}
return ext;
Expand Down Expand Up @@ -1502,6 +1505,7 @@ export class PreTrainedModel extends Callable {
if (this.config.model_type === 'musicgen') {
// Custom logic (TODO: move to Musicgen class)
decoder_input_ids = Array.from({
// @ts-expect-error TS2339
length: batch_size * this.config.decoder.num_codebooks
}, () => [decoder_start_token_id]);

Expand Down Expand Up @@ -1831,11 +1835,13 @@ export class PreTrainedModel extends Callable {
async encode_image({ pixel_values }) {
// image_inputs === { pixel_values }
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
// @ts-expect-error TS2339
if (!this.config.num_image_tokens) {
console.warn(
'The number of image tokens was not set in the model configuration. ' +
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
)
// @ts-expect-error TS2339
this.config.num_image_tokens = features.dims[1];
}
return features;
Expand Down Expand Up @@ -3220,6 +3226,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {

if (generation_config.return_token_timestamps) {
outputs["token_timestamps"] = this._extract_token_timestamps(
// @ts-expect-error TS2345
outputs,
generation_config.alignment_heads,
generation_config.num_frames,
Expand Down Expand Up @@ -3255,6 +3262,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
);
}

// @ts-expect-error TS2339
let median_filter_width = this.config.median_filter_width;
if (median_filter_width === undefined) {
console.warn("Model config has no `median_filter_width`, using default value of 7.")
Expand All @@ -3265,6 +3273,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
const batch = generate_outputs.cross_attentions;
// Create a list with `decoder_layers` elements, each a tensor of shape
// (batch size, attention_heads, output length, input length).
// @ts-expect-error TS2339
const cross_attentions = Array.from({ length: this.config.decoder_layers },
// Concatenate the cross attentions for each layer across sequence length dimension.
(_, i) => cat(batch.map(x => x[i]), 2)
Expand Down Expand Up @@ -3385,6 +3394,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
attention_mask,
}) {

// @ts-expect-error TS2339
const image_token_index = this.config.image_token_index;

const idsList = input_ids.tolist();
Expand Down Expand Up @@ -6003,10 +6013,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {

const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);

// @ts-expect-error TS2339
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
const maxlen = Math.floor(r * maxlenratio);
const minlen = Math.floor(r * minlenratio);

// @ts-expect-error TS2339
const num_mel_bins = this.config.num_mel_bins;

let spectrogramParts = [];
Expand Down Expand Up @@ -6367,11 +6379,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
*/
_apply_and_filter_by_delay_pattern_mask(outputs) {
const [bs_x_codebooks, seqLength] = outputs.dims;
// @ts-expect-error TS2339
const num_codebooks = this.config.decoder.num_codebooks;
const upperBound = (seqLength - num_codebooks);

let newDataSize = 0;
for (let i = 0; i < outputs.size; ++i) {
// @ts-expect-error TS2339
if (outputs.data[i] === this.config.decoder.pad_token_id) {
continue;
}
Expand Down Expand Up @@ -6401,7 +6415,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
let clonedInputIds = structuredClone(input_ids);
for (let i = 0; i < clonedInputIds.length; ++i) {
for (let j = 0; j < clonedInputIds[i].length; ++j) {
// @ts-expect-error TS2339
if ((i % this.config.decoder.num_codebooks) >= j) {
// @ts-expect-error TS2339
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
}
}
Expand Down Expand Up @@ -6558,6 +6574,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
'past_key_values',
];

/**
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
*/
constructor(...args) {
super(...args);

Expand Down
1 change: 1 addition & 0 deletions src/models/convnext/image_processing_convnext.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export class ConvNextImageProcessor extends ImageProcessor {
/**
* Percentage of the image to crop. Only has an effect if this.size < 384.
*/
// @ts-expect-error TS2339
this.crop_pct = this.config.crop_pct ?? (224 / 256);
}

Expand Down
1 change: 1 addition & 0 deletions src/models/efficientnet/image_processing_efficientnet.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
export class EfficientNetImageProcessor extends ImageProcessor {
constructor(config) {
super(config);
// @ts-expect-error TS2339
this.include_top = this.config.include_top ?? true;
if (this.include_top) {
this.image_std = this.image_std.map(x => x * x);
Expand Down
3 changes: 3 additions & 0 deletions src/models/florence2/processing_florence2.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ export class Florence2Processor extends Processor {
super(config, components);

const {
// @ts-expect-error TS2339
tasks_answer_post_processing_type,
// @ts-expect-error TS2339
task_prompts_without_inputs,
// @ts-expect-error TS2339
task_prompts_with_input,
} = this.image_processor.config;

Expand Down
1 change: 1 addition & 0 deletions src/models/janus/image_processing_janus.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export class VLMImageProcessor extends ImageProcessor {
},
...config,
});
// @ts-expect-error TS2339
this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
}

Expand Down
2 changes: 2 additions & 0 deletions src/models/mgp_str/processing_mgp_str.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ export class MgpstrProcessor extends Processor {
* - bpe_preds: The list of BPE decoded sentences.
* - wp_preds: The list of wp decoded sentences.
*/
// @ts-expect-error The type of this method is not compatible with the one
// in the base class. It might be a good idea to fix this.
batch_decode([char_logits, bpe_logits, wp_logits]) {
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
Expand Down
1 change: 1 addition & 0 deletions src/models/paligemma/processing_paligemma.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ export class PaliGemmaProcessor extends Processor {
}

const bos_token = this.tokenizer.bos_token;
// @ts-expect-error TS2339
const image_seq_length = this.image_processor.config.image_seq_length;
let input_strings;
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
Expand Down
1 change: 1 addition & 0 deletions src/models/qwen2_vl/processing_qwen2_vl.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export class Qwen2VLProcessor extends Processor {
}

if (image_grid_thw) {
// @ts-expect-error TS2551
let merge_length = this.image_processor.config.merge_size ** 2;
let index = 0;

Expand Down
Loading
Loading