@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
270270 } else if ( session_options . externalData !== undefined ) {
271271 externalDataPromises = session_options . externalData . map ( async ( ext ) => {
272272 // if the external data is a string, fetch the file and replace the string with its content
273+ // @ts-expect-error TS2339
273274 if ( typeof ext . data === "string" ) {
275+ // @ts-expect-error TS2339
274276 const ext_buffer = await getModelFile ( pretrained_model_name_or_path , ext . data , true , options ) ;
277+ // @ts-expect-error TS2698
275278 return { ...ext , data : ext_buffer } ;
276279 }
277280 return ext ;
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
15191522 if ( this . config . model_type === 'musicgen' ) {
15201523 // Custom logic (TODO: move to Musicgen class)
15211524 decoder_input_ids = Array . from ( {
1525+ // @ts-expect-error TS2339
15221526 length : batch_size * this . config . decoder . num_codebooks
15231527 } , ( ) => [ decoder_start_token_id ] ) ;
15241528
@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
18481852 async encode_image ( { pixel_values } ) {
18491853 // image_inputs === { pixel_values }
18501854 const features = ( await sessionRun ( this . sessions [ 'vision_encoder' ] , { pixel_values } ) ) . image_features ;
1855+ // @ts-expect-error TS2339
18511856 if ( ! this . config . num_image_tokens ) {
18521857 console . warn (
18531858 'The number of image tokens was not set in the model configuration. ' +
18541859 `Setting it to the number of features detected by the vision encoder (${ features . dims [ 1 ] } ).`
18551860 )
1861+ // @ts-expect-error TS2339
18561862 this . config . num_image_tokens = features . dims [ 1 ] ;
18571863 }
18581864 return features ;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
32803286
32813287 if ( generation_config . return_token_timestamps ) {
32823288 outputs [ "token_timestamps" ] = this . _extract_token_timestamps (
3289+ // @ts-expect-error TS2345
32833290 outputs ,
32843291 generation_config . alignment_heads ,
32853292 generation_config . num_frames ,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
33153322 ) ;
33163323 }
33173324
3325+ // @ts-expect-error TS2339
33183326 let median_filter_width = this . config . median_filter_width ;
33193327 if ( median_filter_width === undefined ) {
33203328 console . warn ( "Model config has no `median_filter_width`, using default value of 7." )
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
33253333 const batch = generate_outputs . cross_attentions ;
33263334 // Create a list with `decoder_layers` elements, each a tensor of shape
33273335 // (batch size, attention_heads, output length, input length).
3336+ // @ts-expect-error TS2339
33283337 const cross_attentions = Array . from ( { length : this . config . decoder_layers } ,
33293338 // Concatenate the cross attentions for each layer across sequence length dimension.
33303339 ( _ , i ) => cat ( batch . map ( x => x [ i ] ) , 2 )
@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
34683477 attention_mask,
34693478 } ) {
34703479
3480+ // @ts-expect-error TS2339
34713481 const image_token_index = this . config . image_token_index ;
34723482
34733483 const idsList = input_ids . tolist ( ) ;
@@ -6201,10 +6211,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
62016211
62026212 const { encoder_outputs, encoder_attention_mask } = await encoderForward ( this , model_inputs ) ;
62036213
6214+ // @ts-expect-error TS2339
62046215 const r = encoder_outputs . dims [ 1 ] / this . config . reduction_factor ;
62056216 const maxlen = Math . floor ( r * maxlenratio ) ;
62066217 const minlen = Math . floor ( r * minlenratio ) ;
62076218
6219+ // @ts-expect-error TS2339
62086220 const num_mel_bins = this . config . num_mel_bins ;
62096221
62106222 let spectrogramParts = [ ] ;
@@ -6569,11 +6581,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
65696581 */
65706582 _apply_and_filter_by_delay_pattern_mask ( outputs ) {
65716583 const [ bs_x_codebooks , seqLength ] = outputs . dims ;
6584+ // @ts-expect-error TS2339
65726585 const num_codebooks = this . config . decoder . num_codebooks ;
65736586 const upperBound = ( seqLength - num_codebooks ) ;
65746587
65756588 let newDataSize = 0 ;
65766589 for ( let i = 0 ; i < outputs . size ; ++ i ) {
6590+ // @ts-expect-error TS2339
65776591 if ( outputs . data [ i ] === this . config . decoder . pad_token_id ) {
65786592 continue ;
65796593 }
@@ -6603,7 +6617,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
66036617 let clonedInputIds = structuredClone ( input_ids ) ;
66046618 for ( let i = 0 ; i < clonedInputIds . length ; ++ i ) {
66056619 for ( let j = 0 ; j < clonedInputIds [ i ] . length ; ++ j ) {
6620+ // @ts-expect-error TS2339
66066621 if ( ( i % this . config . decoder . num_codebooks ) >= j ) {
6622+ // @ts-expect-error TS2339
66076623 clonedInputIds [ i ] [ j ] = BigInt ( this . config . decoder . pad_token_id ) ;
66086624 }
66096625 }
@@ -6760,6 +6776,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
67606776 'past_key_values' ,
67616777 ] ;
67626778
6779+ /**
6780+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
6781+ */
67636782 constructor ( ...args ) {
67646783 super ( ...args ) ;
67656784
0 commit comments