@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
270270 } else if ( session_options . externalData !== undefined ) {
271271 externalDataPromises = session_options . externalData . map ( async ( ext ) => {
272272 // if the external data is a string, fetch the file and replace the string with its content
273+ // @ts-expect-error TS2339
273274 if ( typeof ext . data === "string" ) {
275+ // @ts-expect-error TS2339
274276 const ext_buffer = await getModelFile ( pretrained_model_name_or_path , ext . data , true , options ) ;
277+ // @ts-expect-error TS2698
275278 return { ...ext , data : ext_buffer } ;
276279 }
277280 return ext ;
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
15191522 if ( this . config . model_type === 'musicgen' ) {
15201523 // Custom logic (TODO: move to Musicgen class)
15211524 decoder_input_ids = Array . from ( {
1525+ // @ts-expect-error TS2339
15221526 length : batch_size * this . config . decoder . num_codebooks
15231527 } , ( ) => [ decoder_start_token_id ] ) ;
15241528
@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
18481852 async encode_image ( { pixel_values } ) {
18491853 // image_inputs === { pixel_values }
18501854 const features = ( await sessionRun ( this . sessions [ 'vision_encoder' ] , { pixel_values } ) ) . image_features ;
1855+ // @ts-expect-error TS2339
18511856 if ( ! this . config . num_image_tokens ) {
18521857 console . warn (
18531858 'The number of image tokens was not set in the model configuration. ' +
18541859 `Setting it to the number of features detected by the vision encoder (${ features . dims [ 1 ] } ).`
18551860 )
1861+ // @ts-expect-error TS2339
18561862 this . config . num_image_tokens = features . dims [ 1 ] ;
18571863 }
18581864 return features ;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
32803286
32813287 if ( generation_config . return_token_timestamps ) {
32823288 outputs [ "token_timestamps" ] = this . _extract_token_timestamps (
3289+ // @ts-expect-error TS2345
32833290 outputs ,
32843291 generation_config . alignment_heads ,
32853292 generation_config . num_frames ,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
33153322 ) ;
33163323 }
33173324
3325+ // @ts-expect-error TS2339
33183326 let median_filter_width = this . config . median_filter_width ;
33193327 if ( median_filter_width === undefined ) {
33203328 console . warn ( "Model config has no `median_filter_width`, using default value of 7." )
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
33253333 const batch = generate_outputs . cross_attentions ;
33263334 // Create a list with `decoder_layers` elements, each a tensor of shape
33273335 // (batch size, attention_heads, output length, input length).
3336+ // @ts-expect-error TS2339
33283337 const cross_attentions = Array . from ( { length : this . config . decoder_layers } ,
33293338 // Concatenate the cross attentions for each layer across sequence length dimension.
33303339 ( _ , i ) => cat ( batch . map ( x => x [ i ] ) , 2 )
@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
34683477 attention_mask,
34693478 } ) {
34703479
3480+ // @ts-expect-error TS2339
34713481 const image_token_index = this . config . image_token_index ;
34723482
34733483 const idsList = input_ids . tolist ( ) ;
@@ -6210,10 +6220,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
62106220
62116221 const { encoder_outputs, encoder_attention_mask } = await encoderForward ( this , model_inputs ) ;
62126222
6223+ // @ts-expect-error TS2339
62136224 const r = encoder_outputs . dims [ 1 ] / this . config . reduction_factor ;
62146225 const maxlen = Math . floor ( r * maxlenratio ) ;
62156226 const minlen = Math . floor ( r * minlenratio ) ;
62166227
6228+ // @ts-expect-error TS2339
62176229 const num_mel_bins = this . config . num_mel_bins ;
62186230
62196231 let spectrogramParts = [ ] ;
@@ -6578,11 +6590,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
65786590 */
65796591 _apply_and_filter_by_delay_pattern_mask ( outputs ) {
65806592 const [ bs_x_codebooks , seqLength ] = outputs . dims ;
6593+ // @ts-expect-error TS2339
65816594 const num_codebooks = this . config . decoder . num_codebooks ;
65826595 const upperBound = ( seqLength - num_codebooks ) ;
65836596
65846597 let newDataSize = 0 ;
65856598 for ( let i = 0 ; i < outputs . size ; ++ i ) {
6599+ // @ts-expect-error TS2339
65866600 if ( outputs . data [ i ] === this . config . decoder . pad_token_id ) {
65876601 continue ;
65886602 }
@@ -6612,7 +6626,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
66126626 let clonedInputIds = structuredClone ( input_ids ) ;
66136627 for ( let i = 0 ; i < clonedInputIds . length ; ++ i ) {
66146628 for ( let j = 0 ; j < clonedInputIds [ i ] . length ; ++ j ) {
6629+ // @ts-expect-error TS2339
66156630 if ( ( i % this . config . decoder . num_codebooks ) >= j ) {
6631+ // @ts-expect-error TS2339
66166632 clonedInputIds [ i ] [ j ] = BigInt ( this . config . decoder . pad_token_id ) ;
66176633 }
66186634 }
@@ -6769,6 +6785,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
67696785 'past_key_values' ,
67706786 ] ;
67716787
6788+ /**
6789+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
6790+ */
67726791 constructor ( ...args ) {
67736792 super ( ...args ) ;
67746793
0 commit comments