@@ -1995,12 +1995,16 @@ export class PreTrainedTokenizer extends Callable {
            }
        }

+        // Update additional_special_tokens
+        this.special_tokens.push(...(tokenizerConfig.additional_special_tokens ?? []));
+        this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates
+
        // Slight hack, but it prevents code duplication:
        this.decoder.added_tokens = this.added_tokens;

-        this.added_tokens_regex = new RegExp(
+        this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
            '(' + this.added_tokens.map(escapeRegExp).join('|') + ')'
-        );
+        ) : null;

        // Set mask token if present (otherwise will be undefined, which is fine)
        this.mask_token = this.getToken(tokenizerConfig, 'mask_token');
@@ -2265,8 +2269,7 @@ export class PreTrainedTokenizer extends Callable {
        // Actual function which does encoding, for a single text
        // First, we take care of special tokens. Needed to avoid issues arising from
        // normalization and/or pretokenization (which may not preserve special tokens)
-        const sections = text.split(this.added_tokens_regex).filter(x => x);
-
+        const sections = this.added_tokens_regex ? text.split(this.added_tokens_regex).filter(x => x) : [text];
        let tokens = sections.map(x => {
            if (this.added_tokens.includes(x)) {
                // Ignore added tokens
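
For context on the guard above, a minimal sketch (illustrative, not part of the commit) of the failure mode it fixes: with no added tokens, the old code built `new RegExp('()')`, a pattern that matches the empty string, so `split` broke the input into one section per character.

```js
const added_tokens = [];
const oldRegex = new RegExp('(' + added_tokens.join('|') + ')'); // /()/ — matches the empty string
console.log('abc'.split(oldRegex).filter(x => x)); // ['a', 'b', 'c'] — every character becomes a section

// With the guard, an empty token list yields `null`, and encoding falls back to one section:
const newRegex = added_tokens.length > 0 ? new RegExp('(' + added_tokens.join('|') + ')') : null;
const sections = newRegex ? 'abc'.split(newRegex).filter(x => x) : ['abc']; // ['abc']
```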
@@ -2482,6 +2485,58 @@ export class FalconTokenizer extends PreTrainedTokenizer {

 export class GPTNeoXTokenizer extends PreTrainedTokenizer { }

+
+/**
+ * Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`.
+ * @param {PreTrainedTokenizer} self The tokenizer instance.
+ * @param {string|string[]} raw_inputs The text to tokenize.
+ * @param {Object} tokenizer_options Options to be sent to the tokenizer.
+ * @param {Object} generate_kwargs Generation options.
+ * @returns {Object} Object to be passed to the model.
+ * @private
+ */
+function _build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs) {
+    if (!('language_codes' in self) || !Array.isArray(self.language_codes)) {
+        throw new Error('Tokenizer must have `language_codes` attribute set and it should be an array of language ids.');
+    }
+    if (!('languageRegex' in self) || !(self.languageRegex instanceof RegExp)) {
+        throw new Error('Tokenizer must have `languageRegex` attribute set and it should be a regular expression.');
+    }
+    if (!('lang_to_token' in self) || typeof self.lang_to_token !== 'function') {
+        throw new Error('Tokenizer must have `lang_to_token` attribute set and it should be a function.');
+    }
+    const src_lang_token = generate_kwargs.src_lang;
+    const tgt_lang_token = generate_kwargs.tgt_lang;
+
+    // Check that the target language is valid:
+    if (!self.language_codes.includes(tgt_lang_token)) {
+        throw new Error(`Target language code "${tgt_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`);
+    }
+
+    // Allow `src_lang` to be optional. If not set, we'll use the tokenizer's default.
+    if (src_lang_token !== undefined) {
+        // Check that the source language is valid:
+        if (!self.language_codes.includes(src_lang_token)) {
+            throw new Error(`Source language code "${src_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`);
+        }
+
+        // In the same way as the Python library, we override the post-processor
+        // to force the source language to be first:
+        for (let item of self.post_processor.config.single) {
+            if ('SpecialToken' in item && self.languageRegex.test(item.SpecialToken.id)) {
+                item.SpecialToken.id = self.lang_to_token(src_lang_token);
+                break;
+            }
+        }
+        // TODO: Do the same for pair?
+    }
+
+    // Override the `forced_bos_token_id` to force the correct language
+    generate_kwargs.forced_bos_token_id = self.model.convert_tokens_to_ids([self.lang_to_token(tgt_lang_token)])[0];
+
+    return self._call(raw_inputs, tokenizer_options);
+}
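
For illustration, a hedged sketch of how the shared helper behaves; the tokenizer instance and language codes below are assumptions for the example, not part of the commit:

```js
// Hypothetical call through an NllbTokenizer instance (NLLB-style codes shown):
const generate_kwargs = { src_lang: 'eng_Latn', tgt_lang: 'fra_Latn' };
const model_inputs = tokenizer._build_translation_inputs('Hello world', {}, generate_kwargs);
// Side effects: the post-processor's language SpecialToken is swapped to lang_to_token('eng_Latn'),
// and generate_kwargs.forced_bos_token_id now holds the id of lang_to_token('fra_Latn').
```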
+
 /**
  * The NllbTokenizer class is used to tokenize text for NLLB ("No Language Left Behind") models.
  *
@@ -2502,6 +2557,7 @@ export class NllbTokenizer extends PreTrainedTokenizer {

         this.languageRegex = /^[a-z]{3}_[A-Z][a-z]{3}$/;
         this.language_codes = this.special_tokens.filter(x => this.languageRegex.test(x));
+        this.lang_to_token = x => x; // Identity function
     }

     /**
@@ -2512,34 +2568,40 @@ export class NllbTokenizer extends PreTrainedTokenizer {
      * @returns {Object} Object to be passed to the model.
      */
     _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) {
+        return _build_translation_inputs(this, raw_inputs, tokenizer_options, generate_kwargs);
+    }
+}
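
As a quick illustration of the two language-code conventions involved (codes are examples, not from the commit):

```js
// NLLB language codes are already full tokens, e.g. 'fra_Latn', so lang_to_token is the identity:
const nllbRegex = /^[a-z]{3}_[A-Z][a-z]{3}$/;
console.log(nllbRegex.test('fra_Latn')); // true
console.log(nllbRegex.test('__fr__'));   // false — that's the M2M100 format (see below)
```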

+/**
+ * The M2M100Tokenizer class is used to tokenize text for M2M100 ("Many-to-Many") models.
+ *
+ * M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many
+ * multilingual translation. It was introduced in this [paper](https://arxiv.org/abs/2010.11125)
+ * and first released in [this](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) repository.
+ *
+ * For a list of supported languages (along with their language codes),
+ * @see {@link https://huggingface.co/facebook/m2m100_418M#languages-covered}
+ */
+export class M2M100Tokenizer extends PreTrainedTokenizer {
+    constructor(tokenizerJSON, tokenizerConfig) {
+        super(tokenizerJSON, tokenizerConfig);

-        // Check that the target language is valid:
-        if (!this.language_codes.includes(generate_kwargs.tgt_lang)) {
-            throw new Error(`Target language code "${generate_kwargs.tgt_lang}" is not valid. Must be one of: {${this.language_codes.join(', ')}}`);
-        }
-
-        // Allow `src_lang` to be optional. If not set, we'll use the tokenizer's default.
-        if (generate_kwargs.src_lang !== undefined) {
-            // Check that the source language is valid:
-            if (!this.language_codes.includes(generate_kwargs.src_lang)) {
-                throw new Error(`Source language code "${generate_kwargs.src_lang}" is not valid. Must be one of: {${this.language_codes.join(', ')}}`);
-            }
-
-            // In the same way as the Python library, we override the post-processor
-            // to force the source language to be first:
-            for (let item of this.post_processor.config.single) {
-                if ('SpecialToken' in item && this.languageRegex.test(item.SpecialToken.id)) {
-                    item.SpecialToken.id = generate_kwargs.src_lang;
-                    break;
-                }
-            }
-        }
-
-        // Override the `forced_bos_token_id` to force the correct language
-        generate_kwargs.forced_bos_token_id = this.model.convert_tokens_to_ids([generate_kwargs.tgt_lang])[0];
+        this.languageRegex = /^__[a-z]{2,3}__$/;
+        this.language_codes = this.special_tokens
+            .filter(x => this.languageRegex.test(x))
+            .map(x => x.slice(2, -2));
+        this.lang_to_token = x => `__${x}__`;
+    }

-        return this._call(raw_inputs, tokenizer_options);
+    /**
+     * Helper function to build translation inputs for an `M2M100Tokenizer`.
+     * @param {string|string[]} raw_inputs The text to tokenize.
+     * @param {Object} tokenizer_options Options to be sent to the tokenizer.
+     * @param {Object} generate_kwargs Generation options.
+     * @returns {Object} Object to be passed to the model.
+     */
+    _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) {
+        return _build_translation_inputs(this, raw_inputs, tokenizer_options, generate_kwargs);
     }
 }

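A hedged end-to-end sketch of how the new tokenizer might be exercised; the pipeline API, model id, and output shape are assumptions based on the library's usual patterns, not shown in this commit:

```js
import { pipeline } from '@xenova/transformers';

// M2M100 uses bare ISO codes externally ('en', 'fr') and wraps them
// as '__en__' / '__fr__' internally via lang_to_token:
const translator = await pipeline('translation', 'Xenova/m2m100_418M');
const output = await translator('Hello world', { src_lang: 'en', tgt_lang: 'fr' });
console.log(output); // e.g. [{ translation_text: 'Bonjour le monde' }]
```
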
@@ -3485,6 +3547,7 @@ export class AutoTokenizer {
         'MarianTokenizer': MarianTokenizer,
         'BloomTokenizer': BloomTokenizer,
         'NllbTokenizer': NllbTokenizer,
+        'M2M100Tokenizer': M2M100Tokenizer,
         'LlamaTokenizer': LlamaTokenizer,
         'XLMRobertaTokenizer': XLMRobertaTokenizer,
         'MPNetTokenizer': MPNetTokenizer,