Commit 86de50d

Whisper word-level timestamps (#184)
* Support outputting attentions in generate function
* Add unit tests for concatenating tensors
* Implement `cat` for `dim>0`
* Add `cat` unit tests for > 2 tensors
* Allow for negative indexing + bounds checking
* Add test case for `cat` with negative indexing
* Clean up `safeIndex` helper function
* Allow indexing error message to include dimension
* Reuse `safeIndex` helper function for `normalize_`
* Optimize `cat` indexing
* Implement `stack` tensor operation + add unit tests
* Add TODOs
* Implement `mean` tensor operation
* Implement `std_mean` tensor ops
* Fix order of `std_mean` returns
* Implement median filter
* Implement dynamic time warping
* Implement `neg` tensor op
* Throw error if audio sent to processor is not a `Float32Array`
* Add `round` helper function
* [WIP] Implement basic version of word-level timestamps. Known issues: timestamps not correct for index > 0; punctuation not the same as the Python version
* Fix typo
* Fix timestamps
* Round to 2 decimals
* Fix punctuation
* Fix typing
* Remove debug statements
* Cleanup code
* Cleanup
* Remove debug statements
* Update JSDoc for extract token timestamps function
* Add return type for `std_mean` tensor function
* Improve typing of private whisper tokenizer functions
* Indicate method is private
* Allow whisper feature extractor to be called with Float64Array input
* Fix typo
* Throw error if `cross_attentions` are not present in model output when extracting token timestamps
* Throw error during generate function
* Allow whisper models to be exported with `output_attentions=True`
* Add alignment heads to generation config
* Remove print statement
* Update versions
* Override protobufjs version
* Update package-lock.json
* Require onnx==1.13.1 for conversion (will update once onnxruntime-web supports onnx IR version 9)
* Add unit test for word-level timestamps
* Extract add attentions function out of `generate`
* Fix `findLongestCommonSequence` return types
* Downgrade back to onnxruntime 1.14.0 (1.15.1 is a little too unstable right now)
* Cleanup: use `.map`, rename variables
* Update comments
* Add examples for how to transcribe w/ word-level timestamps
* Add example for transcribing/translating audio longer than 30 seconds
* Make example more compact
1 parent aceab9b commit 86de50d
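
At the heart of the feature, token-level timestamps are recovered by median-filtering the cross-attention weights of the model's alignment heads and then running dynamic time warping (DTW) over the resulting token-by-frame cost matrix. Below is a minimal NumPy sketch of that DTW step, for illustration only: the commit ships an equivalent JavaScript implementation inside the library, and the names here are not the library's API.

```python
import numpy as np

def dtw_align(cost: np.ndarray):
    """Minimal dynamic time warping over a (num_tokens x num_frames) cost
    matrix (e.g. negated, filtered cross-attention weights). Returns a
    monotonic alignment as (token_indices, frame_indices)."""
    N, M = cost.shape
    # Cumulative cost, with an infinite border so every path starts at (0, 0).
    D = np.full((N + 1, M + 1), np.inf)
    D[0, 0] = 0.0
    trace = np.zeros((N + 1, M + 1), dtype=np.int8)  # 0=diagonal, 1=up, 2=left

    for i in range(1, N + 1):
        for j in range(1, M + 1):
            steps = (D[i - 1, j - 1], D[i - 1, j], D[i, j - 1])
            best = int(np.argmin(steps))
            D[i, j] = cost[i - 1, j - 1] + steps[best]
            trace[i, j] = best

    # Backtrack from the bottom-right corner to recover the path.
    i, j = N, M
    tokens, frames = [], []
    while i > 0 and j > 0:
        tokens.append(i - 1)
        frames.append(j - 1)
        if trace[i, j] == 0:
            i, j = i - 1, j - 1
        elif trace[i, j] == 1:
            i -= 1
        else:
            j -= 1
    return np.array(tokens[::-1]), np.array(frames[::-1])
```

Each frame index on the returned path maps to a timestamp (Whisper emits one attention frame per 20 ms of audio), and the per-token boundaries are then grouped into word boundaries by the tokenizer.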

File tree

14 files changed: +1893 −672 lines

package-lock.json

Lines changed: 479 additions & 505 deletions
Some generated files are not rendered by default.

package.json

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
   "name": "@xenova/transformers",
-  "version": "2.3.1",
+  "version": "2.4.0",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
@@ -57,6 +57,10 @@
     "webpack-cli": "^5.0.2",
     "webpack-dev-server": "^4.13.3"
   },
+  "overrides": {
+    "semver": "^7.5.4",
+    "protobufjs": "^7.2.4"
+  },
   "files": [
     "src",
     "dist",
```

scripts/convert.py

Lines changed: 67 additions & 43 deletions
```diff
@@ -11,7 +11,6 @@
     AutoTokenizer,
     HfArgumentParser
 )
-from transformers.utils import cached_file
 
 import onnx
 from optimum.exporters.onnx import main_export
@@ -21,6 +20,18 @@
     QuantType
 )
 
+DEFAULT_QUANTIZE_PARAMS = {
+    'per_channel': True,
+    'reduce_range': True,
+}
+
+MODEL_SPECIFIC_QUANTIZE_PARAMS = {
+    'whisper': {
+        'per_channel': False,
+        'reduce_range': False,
+    }
+}
+
 
 @dataclass
 class ConversionArguments:
@@ -79,18 +90,25 @@ class ConversionArguments:
     )
 
     per_channel: bool = field(
-        default=True,
+        default=None,
         metadata={
            "help": "Whether to quantize weights per channel"
        }
    )
    reduce_range: bool = field(
-        default=True,
+        default=None,
        metadata={
            "help": "Whether to quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode"
        }
    )
 
+    output_attentions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to output attentions from the model. NOTE: This is only supported for whisper models right now."
+        }
+    )
+
 
 def get_operators(model: onnx.ModelProto) -> Set[str]:
     operators = set()
@@ -107,7 +125,7 @@ def traverse_graph(graph):
     return operators
 
 
-def quantize(model_names_or_paths, conv_args: ConversionArguments):
+def quantize(model_names_or_paths, **quantize_kwargs):
     """
     Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU
 
@@ -119,9 +137,8 @@ def quantize(model_names_or_paths, conv_args: ConversionArguments):
    Returns: The Path generated for the quantized
    """
 
-    quant_config = dict(
-        per_channel=conv_args.per_channel,
-        reduce_range=conv_args.reduce_range,
+    quantize_config = dict(
+        **quantize_kwargs,
        per_model_config={}
    )
 
@@ -148,34 +165,25 @@ def quantize(model_names_or_paths, conv_args: ConversionArguments):
            model_output=os.path.join(
                directory_path, f'{file_name_without_extension}_quantized.onnx'),
 
-            per_channel=conv_args.per_channel,
-            reduce_range=conv_args.reduce_range,
-
            weight_type=weight_type,
            optimize_model=False,
 
            # TODO allow user to specify these
            # op_types_to_quantize=['MatMul', 'Add', 'Conv'],
            extra_options=dict(
                EnableSubgraph=True
-            )
+            ),
+            **quantize_kwargs
        )
 
-        quant_config['per_model_config'][file_name_without_extension] = dict(
+        quantize_config['per_model_config'][file_name_without_extension] = dict(
            op_types=list(op_types),
            weight_type=str(weight_type),
        )
 
    # Save quantization config
-    with open(os.path.join(directory_path, 'quant_config.json'), 'w') as fp:
-        json.dump(quant_config, fp, indent=4)
-
-
-def copy_if_exists(model_path, file_name, destination):
-    file = cached_file(model_path, file_name,
-                       _raise_exceptions_for_missing_entries=False)
-    if file is not None:
-        shutil.copy(file, destination)
+    with open(os.path.join(directory_path, 'quantize_config.json'), 'w') as fp:
+        json.dump(quantize_config, fp, indent=4)
 
 
 def main():
@@ -192,35 +200,18 @@ def main():
    # Create output folder
    os.makedirs(output_model_folder, exist_ok=True)
 
-    # Copy certain JSON files, which save_pretrained doesn't handle
-    # copy_if_exists(model_id, 'tokenizer.json', output_model_folder)
-
-    # copy_if_exists(model_id, 'preprocessor_config.json', output_model_folder)
-    # copy_if_exists(model_id, 'generation_config.json', output_model_folder)
-
-    # # Saving the model config
+    # Saving the model config
    config = AutoConfig.from_pretrained(model_id)
-    # config.save_pretrained(output_model_folder)
 
+    tokenizer = None
    try:
        # Save tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        # tokenizer.save_pretrained(output_model_folder)
-
-        # Handle special cases
-        if config.model_type == 'marian':
-            import json
-            from .extra.marian import generate_tokenizer_json
-            tokenizer_json = generate_tokenizer_json(model_id, tokenizer)
-
-            with open(os.path.join(output_model_folder, 'tokenizer.json'), 'w', encoding='utf-8') as fp:
-                json.dump(tokenizer_json, fp)
 
    except KeyError:
        pass  # No Tokenizer
 
-    # Step 1. convert huggingface model to onnx
-    main_export(
+    export_kwargs = dict(
        model_name_or_path=model_id,
        output=output_model_folder,
        task=conv_args.task,
@@ -229,21 +220,54 @@ def main():
        do_validation=not conv_args.skip_validation,
    )
 
+    # Handle special cases
+    if config.model_type == 'marian':
+        from .extra.marian import generate_tokenizer_json
+        tokenizer_json = generate_tokenizer_json(model_id, tokenizer)
+
+        with open(os.path.join(output_model_folder, 'tokenizer.json'), 'w', encoding='utf-8') as fp:
+            json.dump(tokenizer_json, fp)
+
+    elif config.model_type == 'whisper':
+        if conv_args.output_attentions:
+            from .extra.whisper import get_main_export_kwargs
+
+            export_kwargs.update(
+                **get_main_export_kwargs(config, "automatic-speech-recognition")
+            )
+    else:
+        pass  # TODO
+
+    # Step 1. convert huggingface model to onnx
+    main_export(**export_kwargs)
+
    # Step 2. (optional, recommended) quantize the converted model for fast inference and to reduce model size.
    if conv_args.quantize:
+        # Update quantize config with model specific defaults
+        quantize_config = MODEL_SPECIFIC_QUANTIZE_PARAMS.get(
+            config.model_type, DEFAULT_QUANTIZE_PARAMS)
+
        quantize([
            os.path.join(output_model_folder, x)
            for x in os.listdir(output_model_folder)
            if x.endswith('.onnx') and not x.endswith('_quantized.onnx')
-        ], conv_args)
+        ], **quantize_config)
 
    # Step 3. Move .onnx files to the 'onnx' subfolder
    os.makedirs(os.path.join(output_model_folder, 'onnx'), exist_ok=True)
    for file in os.listdir(output_model_folder):
-        if file.endswith('.onnx') or file.endswith('.onnx_data'):
+        if file.endswith(('.onnx', '.onnx_data')):
            shutil.move(os.path.join(output_model_folder, file),
                        os.path.join(output_model_folder, 'onnx', file))
 
+    # Step 4. Update the generation config if necessary
+    if config.model_type == 'whisper':
+        from transformers import GenerationConfig
+        from .extra.whisper import get_alignment_heads
+
+        generation_config = GenerationConfig.from_pretrained(model_id)
+        generation_config.alignment_heads = get_alignment_heads(config)
+        generation_config.save_pretrained(output_model_folder)
 
 if __name__ == '__main__':
    main()
```
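
A subtlety in the new quantization defaults: the model-specific parameters are selected with a single `dict.get`, so they replace the defaults wholesale rather than merging key-by-key. A small self-contained sketch of that resolution logic, mirroring the dicts defined in the diff above (the helper name is illustrative, not part of the script):

```python
DEFAULT_QUANTIZE_PARAMS = {'per_channel': True, 'reduce_range': True}
MODEL_SPECIFIC_QUANTIZE_PARAMS = {
    'whisper': {'per_channel': False, 'reduce_range': False},
}

def resolve_quantize_params(model_type: str) -> dict:
    # Whole-dict fallback: a model type either uses its own full config
    # or the defaults; per-key merging does not happen.
    return MODEL_SPECIFIC_QUANTIZE_PARAMS.get(model_type, DEFAULT_QUANTIZE_PARAMS)

assert resolve_quantize_params('whisper') == {'per_channel': False, 'reduce_range': False}
assert resolve_quantize_params('bert') == {'per_channel': True, 'reduce_range': True}
```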

scripts/extra/whisper.py

Lines changed: 76 additions & 0 deletions
```python
from optimum.exporters.onnx.model_configs import WhisperOnnxConfig

from optimum.exporters.onnx.base import ConfigBehavior
from typing import Dict

# List of [layer, head] pairs that select the cross-attention heads that are highly correlated to word-level timing.
# Source: https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a
ALIGNMENT_HEADS_MAPPING = {
    'whisper-tiny.en': [[1, 0], [2, 0], [2, 5], [3, 0], [3, 1], [3, 2], [3, 3], [3, 4]],
    'whisper-tiny': [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]],
    'whisper-base.en': [[3, 3], [4, 7], [5, 1], [5, 5], [5, 7]],
    'whisper-base': [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]],
    'whisper-small.en': [[6, 6], [7, 0], [7, 3], [7, 8], [8, 2], [8, 5], [8, 7], [9, 0], [9, 4], [9, 8], [9, 10], [10, 0], [10, 1], [10, 2], [10, 3], [10, 6], [10, 11], [11, 2], [11, 4]],
    'whisper-small': [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]],
    'whisper-medium.en': [[11, 4], [14, 1], [14, 12], [14, 14], [15, 4], [16, 0], [16, 4], [16, 9], [17, 12], [17, 14], [18, 7], [18, 10], [18, 15], [20, 0], [20, 3], [20, 9], [20, 14], [21, 12]],
    'whisper-medium': [[13, 15], [15, 4], [15, 15], [16, 1], [20, 0], [23, 4]],
    'whisper-large-v2': [[10, 12], [13, 17], [16, 11], [16, 12], [16, 13], [17, 15], [17, 16], [18, 4], [18, 11], [18, 19], [19, 11], [21, 2], [21, 3], [22, 3], [22, 9], [22, 12], [23, 5], [23, 7], [23, 13], [25, 5], [26, 1], [26, 12], [27, 15]],
    'whisper-large': [[9, 19], [11, 2], [11, 4], [11, 17], [22, 7], [22, 11], [22, 17], [23, 2], [23, 15]],
}


class CustomWhisperOnnxConfig(WhisperOnnxConfig):
    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        common_outputs = super().outputs

        if self._behavior is ConfigBehavior.ENCODER:
            for i in range(self._config.encoder_layers):
                common_outputs[f"encoder_attentions.{i}"] = {0: "batch_size"}
        elif self._behavior is ConfigBehavior.DECODER:
            for i in range(self._config.decoder_layers):
                common_outputs[f"decoder_attentions.{i}"] = {
                    0: "batch_size", 3: "decoder_sequence_length"}
            for i in range(self._config.decoder_layers):
                common_outputs[f"cross_attentions.{i}"] = {
                    0: "batch_size", 3: "cross_attention_length"}

        return common_outputs

    @property
    def torch_to_onnx_output_map(self):
        if self._behavior is ConfigBehavior.ENCODER:
            # The encoder export uses WhisperEncoder that returns the key "attentions"
            return {"attentions": "encoder_attentions"}
        else:
            return {}


def get_main_export_kwargs(config, task):

    custom_config = CustomWhisperOnnxConfig(config=config, task=task)

    custom_onnx_configs = dict(
        encoder_model=custom_config.with_behavior("encoder"),
        decoder_model=custom_config.with_behavior("decoder", use_past=False),
        decoder_with_past_model=custom_config.with_behavior(
            "decoder", use_past=True),
    )

    return dict(
        model_kwargs={"output_attentions": True},
        custom_onnx_configs=custom_onnx_configs,
    )


def get_alignment_heads(config):
    if getattr(config, '_name_or_path', None) is None:
        raise ValueError(
            "Unable to determine model type from config. Please specify `_name_or_path` in the config.")

    for model_name, heads in ALIGNMENT_HEADS_MAPPING.items():
        if model_name in config._name_or_path:
            return heads

    raise ValueError(
        f"Unknown model type: {config._name_or_path}. Please add one of the following model types to `_name_or_path` in the config file: {list(ALIGNMENT_HEADS_MAPPING.keys())}")
```

scripts/requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,3 +1,4 @@
 transformers[torch]@git+https://github.com/huggingface/transformers
 optimum[onnxruntime]@git+https://github.com/huggingface/optimum
 tqdm
+onnx==1.13.1
```
