diff --git a/package.json b/package.json index 361158609..00d1d3622 100644 --- a/package.json +++ b/package.json @@ -24,7 +24,7 @@ "scripts": { "format": "prettier --write .", "format:check": "prettier --check .", - "typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types", + "typegen": "tsc --build", "dev": "webpack serve --no-client-overlay", "build": "webpack && npm run typegen", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose", diff --git a/src/base/image_processors_utils.js b/src/base/image_processors_utils.js index 6788258f6..ce5bf2298 100644 --- a/src/base/image_processors_utils.js +++ b/src/base/image_processors_utils.js @@ -604,14 +604,20 @@ export class ImageProcessor extends Callable { this.do_thumbnail = config.do_thumbnail; this.size = config.size ?? config.image_size; this.do_resize = config.do_resize ?? (this.size !== undefined); + // @ts-expect-error TS2339 this.size_divisibility = config.size_divisibility ?? config.size_divisor; this.do_center_crop = config.do_center_crop; + // @ts-expect-error TS2339 this.crop_size = config.crop_size; + // @ts-expect-error TS2339 this.do_convert_rgb = config.do_convert_rgb ?? true; + // @ts-expect-error TS2339 this.do_crop_margin = config.do_crop_margin; + // @ts-expect-error TS2339 this.pad_size = config.pad_size; + // @ts-expect-error TS2339 this.do_pad = config.do_pad; if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) { @@ -818,6 +824,7 @@ export class ImageProcessor extends Callable { // Support both formats for backwards compatibility else if (Number.isInteger(size)) { shortest_edge = size; + // @ts-expect-error TS2339 longest_edge = this.config.max_size ?? 
shortest_edge; } else if (size !== undefined) { @@ -886,6 +893,7 @@ export class ImageProcessor extends Callable { } else if (size.min_pixels !== undefined && size.max_pixels !== undefined) { // Custom resize logic for Qwen2-VL models const { min_pixels, max_pixels } = size; + // @ts-expect-error TS2339 const factor = this.config.patch_size * this.config.merge_size; return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels); } else { @@ -901,6 +909,7 @@ export class ImageProcessor extends Callable { async resize(image) { const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size); return await image.resize(newWidth, newHeight, { + // @ts-expect-error TS2322 resample: this.resample, }); } @@ -951,6 +960,7 @@ export class ImageProcessor extends Callable { // Resize the image using thumbnail method. if (this.do_thumbnail) { + // @ts-expect-error TS2345 image = await this.thumbnail(image, this.size, this.resample); } @@ -975,6 +985,7 @@ export class ImageProcessor extends Callable { // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`) // occurs with data in the hwc format (height, width, channels), // to emulate the behavior of the original Python code (w/ numpy). + /** @type {Float32Array} */ let pixelData = Float32Array.from(image.data); let imgDims = [image.height, image.width, image.channels]; diff --git a/src/base/processing_utils.js b/src/base/processing_utils.js index 2e457e20d..ecc1c6155 100644 --- a/src/base/processing_utils.js +++ b/src/base/processing_utils.js @@ -28,6 +28,7 @@ import { getModelJSON } from '../utils/hub.js'; /** * @typedef {Object} ProcessorProperties Additional processor-specific properties. 
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions + * @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer */ @@ -61,7 +62,7 @@ export class Processor extends Callable { } /** - * @returns {import('../tokenizers.js').PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists. + * @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists. */ get tokenizer() { return this.components.tokenizer; @@ -74,6 +75,11 @@ export class Processor extends Callable { return this.components.feature_extractor; } + /** + * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages + * @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options + * @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>} + */ apply_chat_template(messages, options = {}) { if (!this.tokenizer) { throw new Error('Unable to apply chat template without a tokenizer.'); @@ -84,6 +90,10 @@ export class Processor extends Callable { }); } + /** + * @param {Parameters<PreTrainedTokenizer['batch_decode']>} args + * @returns {ReturnType<PreTrainedTokenizer['batch_decode']>} + */ batch_decode(...args) { if (!this.tokenizer) { throw new Error('Unable to decode without a tokenizer.'); diff --git a/src/configs.js b/src/configs.js index a40bb59d9..ce01139e6 100644 --- a/src/configs.js +++ b/src/configs.js @@ -70,15 +70,19 @@ function getNormalizedConfig(config) { case 'florence2': case 'llava_onevision': case 'idefics3': + // @ts-expect-error TS2339 init_normalized_config = getNormalizedConfig(config.text_config); break; case 'moondream1': + // @ts-expect-error TS2339 init_normalized_config = getNormalizedConfig(config.phi_config); break; case 'musicgen': + // @ts-expect-error TS2339 init_normalized_config = getNormalizedConfig(config.decoder); break; case 'multi_modality': + // @ts-expect-error TS2339 init_normalized_config = getNormalizedConfig(config.language_config); break; @@ -191,6 +195,7 @@ function getNormalizedConfig(config) { break; case 'vision-encoder-decoder': + // @ts-expect-error TS2339 const decoderConfig = 
getNormalizedConfig(config.decoder); const add_encoder_pkv = 'num_decoder_layers' in decoderConfig; diff --git a/src/models.js b/src/models.js index f8242b5f0..fe4d46365 100644 --- a/src/models.js +++ b/src/models.js @@ -269,8 +269,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) { } else if (session_options.externalData !== undefined) { externalDataPromises = session_options.externalData.map(async (ext) => { // if the external data is a string, fetch the file and replace the string with its content + // @ts-expect-error TS2339 if (typeof ext.data === "string") { + // @ts-expect-error TS2339 const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options); + // @ts-expect-error TS2698 return { ...ext, data: ext_buffer }; } return ext; @@ -1502,6 +1505,7 @@ export class PreTrainedModel extends Callable { if (this.config.model_type === 'musicgen') { // Custom logic (TODO: move to Musicgen class) decoder_input_ids = Array.from({ + // @ts-expect-error TS2339 length: batch_size * this.config.decoder.num_codebooks }, () => [decoder_start_token_id]); @@ -1831,11 +1835,13 @@ export class PreTrainedModel extends Callable { async encode_image({ pixel_values }) { // image_inputs === { pixel_values } const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features; + // @ts-expect-error TS2339 if (!this.config.num_image_tokens) { console.warn( 'The number of image tokens was not set in the model configuration. 
' + `Setting it to the number of features detected by the vision encoder (${features.dims[1]}).` ) + // @ts-expect-error TS2339 this.config.num_image_tokens = features.dims[1]; } return features; @@ -3220,6 +3226,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { if (generation_config.return_token_timestamps) { outputs["token_timestamps"] = this._extract_token_timestamps( + // @ts-expect-error TS2345 outputs, generation_config.alignment_heads, generation_config.num_frames, @@ -3255,6 +3262,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { ); } + // @ts-expect-error TS2339 let median_filter_width = this.config.median_filter_width; if (median_filter_width === undefined) { console.warn("Model config has no `median_filter_width`, using default value of 7.") @@ -3265,6 +3273,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { const batch = generate_outputs.cross_attentions; // Create a list with `decoder_layers` elements, each a tensor of shape // (batch size, attention_heads, output length, input length). + // @ts-expect-error TS2339 const cross_attentions = Array.from({ length: this.config.decoder_layers }, // Concatenate the cross attentions for each layer across sequence length dimension. 
(_, i) => cat(batch.map(x => x[i]), 2) @@ -3385,6 +3394,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel { attention_mask, }) { + // @ts-expect-error TS2339 const image_token_index = this.config.image_token_index; const idsList = input_ids.tolist(); @@ -6003,10 +6013,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); + // @ts-expect-error TS2339 const r = encoder_outputs.dims[1] / this.config.reduction_factor; const maxlen = Math.floor(r * maxlenratio); const minlen = Math.floor(r * minlenratio); + // @ts-expect-error TS2339 const num_mel_bins = this.config.num_mel_bins; let spectrogramParts = []; @@ -6367,11 +6379,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: */ _apply_and_filter_by_delay_pattern_mask(outputs) { const [bs_x_codebooks, seqLength] = outputs.dims; + // @ts-expect-error TS2339 const num_codebooks = this.config.decoder.num_codebooks; const upperBound = (seqLength - num_codebooks); let newDataSize = 0; for (let i = 0; i < outputs.size; ++i) { + // @ts-expect-error TS2339 if (outputs.data[i] === this.config.decoder.pad_token_id) { continue; } @@ -6401,7 +6415,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: let clonedInputIds = structuredClone(input_ids); for (let i = 0; i < clonedInputIds.length; ++i) { for (let j = 0; j < clonedInputIds[i].length; ++j) { + // @ts-expect-error TS2339 if ((i % this.config.decoder.num_codebooks) >= j) { + // @ts-expect-error TS2339 clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id); } } @@ -6558,6 +6574,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel { 'past_key_values', ]; + /** + * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args + */ constructor(...args) { super(...args); diff --git a/src/models/convnext/image_processing_convnext.js 
b/src/models/convnext/image_processing_convnext.js index 525e736cd..7b867a60a 100644 --- a/src/models/convnext/image_processing_convnext.js +++ b/src/models/convnext/image_processing_convnext.js @@ -9,6 +9,7 @@ export class ConvNextImageProcessor extends ImageProcessor { /** * Percentage of the image to crop. Only has an effect if this.size < 384. */ + // @ts-expect-error TS2339 this.crop_pct = this.config.crop_pct ?? (224 / 256); } diff --git a/src/models/efficientnet/image_processing_efficientnet.js b/src/models/efficientnet/image_processing_efficientnet.js index 9fde87156..29a594e18 100644 --- a/src/models/efficientnet/image_processing_efficientnet.js +++ b/src/models/efficientnet/image_processing_efficientnet.js @@ -5,6 +5,7 @@ import { export class EfficientNetImageProcessor extends ImageProcessor { constructor(config) { super(config); + // @ts-expect-error TS2339 this.include_top = this.config.include_top ?? true; if (this.include_top) { this.image_std = this.image_std.map(x => x * x); diff --git a/src/models/florence2/processing_florence2.js b/src/models/florence2/processing_florence2.js index ec644df25..5c1dfcc24 100644 --- a/src/models/florence2/processing_florence2.js +++ b/src/models/florence2/processing_florence2.js @@ -10,8 +10,11 @@ export class Florence2Processor extends Processor { super(config, components); const { + // @ts-expect-error TS2339 tasks_answer_post_processing_type, + // @ts-expect-error TS2339 task_prompts_without_inputs, + // @ts-expect-error TS2339 task_prompts_with_input, } = this.image_processor.config; diff --git a/src/models/janus/image_processing_janus.js b/src/models/janus/image_processing_janus.js index 4dae64ff4..ff191855a 100644 --- a/src/models/janus/image_processing_janus.js +++ b/src/models/janus/image_processing_janus.js @@ -13,6 +13,7 @@ export class VLMImageProcessor extends ImageProcessor { }, ...config, }); + // @ts-expect-error TS2339 this.constant_values = this.config.background_color.map(x => x * 
this.rescale_factor) } diff --git a/src/models/mgp_str/processing_mgp_str.js b/src/models/mgp_str/processing_mgp_str.js index eb4dbdf6e..a4da992cc 100644 --- a/src/models/mgp_str/processing_mgp_str.js +++ b/src/models/mgp_str/processing_mgp_str.js @@ -119,6 +119,8 @@ export class MgpstrProcessor extends Processor { * - bpe_preds: The list of BPE decoded sentences. * - wp_preds: The list of wp decoded sentences. */ + // @ts-expect-error The type of this method is not compatible with the one + // in the base class. It might be a good idea to fix this. batch_decode([char_logits, bpe_logits, wp_logits]) { const [char_preds, char_scores] = this._decode_helper(char_logits, 'char'); const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe'); diff --git a/src/models/paligemma/processing_paligemma.js b/src/models/paligemma/processing_paligemma.js index 9fb580cd8..63f50b99d 100644 --- a/src/models/paligemma/processing_paligemma.js +++ b/src/models/paligemma/processing_paligemma.js @@ -41,6 +41,7 @@ export class PaliGemmaProcessor extends Processor { } const bos_token = this.tokenizer.bos_token; + // @ts-expect-error TS2339 const image_seq_length = this.image_processor.config.image_seq_length; let input_strings; if (text.some((t) => t.includes(IMAGE_TOKEN))) { diff --git a/src/models/qwen2_vl/processing_qwen2_vl.js b/src/models/qwen2_vl/processing_qwen2_vl.js index d5f05535b..692987f86 100644 --- a/src/models/qwen2_vl/processing_qwen2_vl.js +++ b/src/models/qwen2_vl/processing_qwen2_vl.js @@ -28,6 +28,7 @@ export class Qwen2VLProcessor extends Processor { } if (image_grid_thw) { + // @ts-expect-error TS2551 let merge_length = this.image_processor.config.merge_size ** 2; let index = 0; diff --git a/src/pipelines.js b/src/pipelines.js index a61cb1dde..ce8dbd9cc 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -294,6 +294,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi // TODO: Use softmax tensor function const 
function_to_apply = + // @ts-expect-error TS2339 this.model.config.problem_type === 'multi_label_classification' ? batch => batch.sigmoid() : batch => new Tensor( @@ -302,6 +303,7 @@ export class TextClassificationPipeline extends (/** @type {new (options: TextPi batch.dims, ); // single_label_classification (default) + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; const toReturn = []; @@ -404,6 +406,7 @@ export class TokenClassificationPipeline extends (/** @type {new (options: TextP const outputs = await this.model(model_inputs) const logits = outputs.logits; + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; const toReturn = []; @@ -743,11 +746,14 @@ export class Text2TextGenerationPipeline extends (/** @type {new (options: TextP // Add global prefix, if present + // @ts-expect-error TS2339 if (this.model.config.prefix) { + // @ts-expect-error TS2339 texts = texts.map(x => this.model.config.prefix + x) } // Handle task specific params: + // @ts-expect-error TS2339 const task_specific_params = this.model.config.task_specific_params if (task_specific_params && task_specific_params[this.task]) { // Add prefixes, if present @@ -1486,6 +1492,7 @@ export class AudioClassificationPipeline extends (/** @type {new (options: Audio const sampling_rate = this.processor.feature_extractor.config.sampling_rate; const preparedAudios = await prepareAudios(audio, sampling_rate); + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; const toReturn = []; @@ -1794,6 +1801,7 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options audio = [/** @type {AudioInput} */ (audio)]; } + // @ts-expect-error TS2339 const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions; const hop_length = this.processor.feature_extractor.config.hop_length; @@ -1859,7 +1867,9 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new 
(options // TODO: Right now we only get top beam if (return_timestamps === 'word') { + // @ts-expect-error TS2339 chunk.tokens = data.sequences.tolist()[0]; + // @ts-expect-error TS2339 chunk.token_timestamps = data.token_timestamps.tolist()[0].map( (/** @type {number} */ x) => round(x, 2) ); @@ -2025,6 +2035,7 @@ export class ImageClassificationPipeline extends (/** @type {new (options: Image const { pixel_values } = await this.processor(preparedImages); const output = await this.model({ pixel_values }); + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; /** @type {ImageClassificationOutput[]} */ @@ -2139,6 +2150,7 @@ export class ImageSegmentationPipeline extends (/** @type {new (options: ImagePi } } + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; /** @type {ImageSegmentationPipelineOutput[]} */ @@ -2365,6 +2377,7 @@ export class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipe const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes); // Add labels + // @ts-expect-error TS2339 const id2label = this.model.config.id2label; // Format output @@ -2584,6 +2597,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: // Run model const output = await this.model.generate({ inputs: pixel_values, + // @ts-expect-error TS2339 max_length: this.model.config.decoder.max_position_embeddings, decoder_input_ids, ...generate_kwargs, @@ -2699,6 +2713,7 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi // Generate waveform const { waveform } = await this.model(inputs); + // @ts-expect-error TS2339 const sampling_rate = this.model.config.sampling_rate; return { audio: waveform.data, @@ -3338,4 +3353,4 @@ async function loadItems(mapping, model, pretrainedOptions) { } return result; -} \ No newline at end of file +} diff --git a/src/utils/dtypes.js b/src/utils/dtypes.js index 0d6e190e0..845eef5e0 100644 
--- a/src/utils/dtypes.js +++ b/src/utils/dtypes.js @@ -1,3 +1,5 @@ +/// <reference types="@webgpu/types" /> + import { apis } from "../env.js"; import { DEVICE_TYPES } from "./devices.js"; diff --git a/src/utils/hub.js b/src/utils/hub.js index c5fcfacf1..17ee4c1b1 100755 --- a/src/utils/hub.js +++ b/src/utils/hub.js @@ -121,7 +121,7 @@ class FileResponse { */ async arrayBuffer() { const data = await fs.promises.readFile(this.filePath); - return data.buffer; + return /** @type {ArrayBuffer} */ (data.buffer); } /** diff --git a/jsconfig.json b/tsconfig.json similarity index 56% rename from jsconfig.json rename to tsconfig.json index 9af7d54be..fb6de7097 100644 --- a/jsconfig.json +++ b/tsconfig.json @@ -6,7 +6,14 @@ "checkJs": true, "target": "esnext", "module": "nodenext", - "moduleResolution": "nodenext" + "moduleResolution": "nodenext", + "outDir": "types", + "strict": false, + "skipLibCheck": true, + "declaration": true, + "declarationMap": true, + "noEmit": false, + "emitDeclarationOnly": true }, "typeAcquisition": { "include": ["jest"]