Support ONNX export

yqzhishen · yqzhishen · commit 1b2dc3566253 · 2023-10-06T20:23:35.000+08:00
diff --git a/deployment/__init__.py b/deployment/__init__.py
@@ -0,0 +1,8 @@
+from .base_onnx_module import BaseONNXModule
+from .me_onnx_module import MIDIExtractionONNXModule
+from .me_quant_onnx_module import QuantizedMIDIExtractionONNXModule
+
+task_module_mapping = {  # config['task_cls'] value -> dotted path of the ONNX module class (looked up in export.py)
+    'training.MIDIExtractionTask': 'deployment.MIDIExtractionONNXModule',
+    'training.QuantizedMIDIExtractionTask': 'deployment.QuantizedMIDIExtractionONNXModule',
+}
diff --git a/deployment/base_onnx_module.py b/deployment/base_onnx_module.py
@@ -0,0 +1,80 @@
+import pathlib
+from collections import OrderedDict
+
+from librosa.filters import mel
+import torch
+from torch import nn
+
+from utils import build_object_from_class_name
+
+
+class BaseONNXModule(nn.Module):
+    """Base export wrapper: builds the model named in the config and loads its checkpoint weights."""
+    def __init__(self, config: dict, model_path: pathlib.Path, device=None):
+        super().__init__()
+        self.config = config
+        self.model_path = model_path
+        self.device = device if device is not None else ('cuda' if torch.cuda.is_available() else 'cpu')
+        self.timestep = self.config['hop_size'] / self.config['audio_sample_rate']
+        self.model: torch.nn.Module = self.build_model()
+
+    def build_model(self) -> nn.Module:
+        """Create ``config['model_cls']`` in eval mode and strictly load the ``model.``-prefixed weights."""
+        prefix_in_ckpt = 'model'
+        net: nn.Module = build_object_from_class_name(
+            self.config['model_cls'], nn.Module, config=self.config
+        ).eval().to(self.device)
+        checkpoint = torch.load(self.model_path, map_location=self.device)
+        weights = OrderedDict(
+            (key.partition('.')[2], value)
+            for key, value in checkpoint['state_dict'].items() if key.startswith(f'{prefix_in_ckpt}.')
+        )
+        net.load_state_dict(weights, strict=True)
+        print(f'| load \'{prefix_in_ckpt}\' from \'{self.model_path}\'.')
+        return net
+
+
+class MelSpectrogram_ONNX(nn.Module):
+    """Log-mel spectrogram front end built on ``torch.stft`` with real-valued output."""
+
+    def __init__(
+            self,
+            n_mel_channels,
+            sampling_rate,
+            win_length,
+            hop_length,
+            n_fft=None,
+            mel_fmin=0,
+            mel_fmax=None,
+            clamp=1e-5
+    ):
+        super().__init__()
+        # The FFT size falls back to the window length when it is not given.
+        if n_fft is None:
+            n_fft = win_length
+        self.n_mel_channels = n_mel_channels
+        self.sampling_rate = sampling_rate
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.n_fft = n_fft
+        self.clamp = clamp
+        # Mel filterbank from librosa (htk=True), kept as a float buffer named "mel_basis".
+        filterbank = mel(
+            sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels,
+            fmin=mel_fmin, fmax=mel_fmax, htk=True
+        )
+        self.register_buffer("mel_basis", torch.from_numpy(filterbank).float())
+
+    def forward(self, audio, center=True):
+        """Return ``log(clamp(mel_basis @ |STFT(audio)|, min=self.clamp))``."""
+        hann = torch.hann_window(self.win_length, device=audio.device)
+        spec = torch.stft(
+            audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length,
+            window=hann, center=center, return_complex=False
+        )
+        # With return_complex=False the last axis holds (real, imag); reduce it to the magnitude.
+        power = torch.sum(spec ** 2, dim=-1)
+        magnitude = torch.sqrt(power)
+        mel_energy = torch.matmul(self.mel_basis, magnitude)
+        clamped = torch.clamp(mel_energy, min=self.clamp)
+        return torch.log(clamped)
diff --git a/deployment/me_onnx_module.py b/deployment/me_onnx_module.py
@@ -0,0 +1,39 @@
+import pathlib
+
+import torch
+
+from utils.infer_utils import decode_bounds_to_alignment, decode_gaussian_blurred_probs, decode_note_sequence
+from .base_onnx_module import BaseONNXModule, MelSpectrogram_ONNX
+
+
+class MIDIExtractionONNXModule(BaseONNXModule):
+    """Waveform-to-notes export wrapper that decodes Gaussian-blurred MIDI probabilities."""
+    def __init__(self, config: dict, model_path: pathlib.Path, device=None):
+        super().__init__(config, model_path, device=device)
+        cfg = self.config
+        self.mel_extractor = MelSpectrogram_ONNX(
+            n_mel_channels=cfg['units_dim'], sampling_rate=cfg['audio_sample_rate'],
+            win_length=cfg['win_size'], hop_length=cfg['hop_size'],
+            mel_fmin=cfg['fmin'], mel_fmax=cfg['fmax']
+        ).to(self.device)
+        self.rmvpe = None
+        self.midi_min, self.midi_max = cfg['midi_min'], cfg['midi_max']
+        self.midi_deviation = cfg['midi_prob_deviation']
+        self.rest_threshold = cfg['rest_threshold']
+
+    def forward(self, waveform: torch.Tensor):
+        """Return ``(note_midi, note_rest, note_dur)``; durations are multiplied by ``self.timestep``."""
+        mel_frames = self.mel_extractor(waveform).transpose(1, 2)
+        f0 = torch.zeros(mel_frames.shape[:2], dtype=torch.float32, device=self.device)
+        masks = torch.ones_like(f0, dtype=torch.bool)
+        probs, bounds = self.model(x=mel_frames, f0=f0, mask=masks, sig=True)
+        probs *= masks[..., None]
+        bounds *= masks
+        frame2note = decode_bounds_to_alignment(bounds, use_diff=False) * masks
+        frame_midi, frame_rest = decode_gaussian_blurred_probs(
+            probs, vmin=self.midi_min, vmax=self.midi_max,
+            deviation=self.midi_deviation, threshold=self.rest_threshold
+        )
+        note_midi, note_dur, note_voiced = decode_note_sequence(
+            frame2note, frame_midi, ~frame_rest & masks)
+        return note_midi, ~note_voiced, note_dur * self.timestep
diff --git a/deployment/me_quant_onnx_module.py b/deployment/me_quant_onnx_module.py
@@ -0,0 +1,33 @@
+import pathlib
+
+import torch
+
+from utils.infer_utils import decode_bounds_to_alignment, decode_note_sequence
+from .base_onnx_module import BaseONNXModule, MelSpectrogram_ONNX
+
+
+class QuantizedMIDIExtractionONNXModule(BaseONNXModule):
+    """Waveform-to-notes export wrapper using the per-frame argmax class; class 128 is treated as rest."""
+    def __init__(self, config: dict, model_path: pathlib.Path, device=None):
+        super().__init__(config, model_path, device=device)
+        self.mel_extractor = MelSpectrogram_ONNX(
+            n_mel_channels=self.config['units_dim'], sampling_rate=self.config['audio_sample_rate'],
+            win_length=self.config['win_size'], hop_length=self.config['hop_size'],
+            mel_fmin=self.config['fmin'], mel_fmax=self.config['fmax']
+        ).to(self.device)
+        self.rmvpe = None
+
+    def forward(self, waveform: torch.Tensor):
+        """Return ``(note_midi, note_rest, note_dur)``; durations are multiplied by ``self.timestep``."""
+        mel_frames = self.mel_extractor(waveform).transpose(1, 2)
+        f0 = torch.zeros(mel_frames.shape[:2], dtype=torch.float32, device=self.device)
+        masks = torch.ones_like(f0, dtype=torch.bool)
+        probs, bounds = self.model(x=mel_frames, f0=f0, mask=masks, sig=True)
+        probs *= masks[..., None]
+        bounds *= masks
+        frame2note = decode_bounds_to_alignment(bounds) * masks
+        frame_cls = probs.argmax(dim=-1)
+        frame_rest = frame_cls == 128
+        note_midi, note_dur, note_voiced = decode_note_sequence(
+            frame2note, frame_cls.clip(min=0, max=127), ~frame_rest & masks)
+        return note_midi, ~note_voiced, note_dur * self.timestep
diff --git a/export.py b/export.py
@@ -0,0 +1,110 @@
+import importlib
+import pathlib
+from typing import Dict, Tuple, Union
+
+import click
+import onnx
+import onnxsim
+import torch
+import yaml
+
+import deployment
+from utils.config_utils import print_config
+
+
+def onnx_override_io_shapes(
+        model,  # ModelProto
+        input_shapes: Dict[str, Tuple[Union[str, int], ...]] = None,
+        output_shapes: Dict[str, Tuple[Union[str, int], ...]] = None,
+):
+    """
+    Override the shapes of inputs/outputs of the model graph (in-place operation).
+    :param model: model to perform the operation on
+    :param input_shapes: a dict mapping graph input names to shape tuples of any rank; an int entry
+        sets a fixed ``dim_value``, any other entry (a str) sets a symbolic ``dim_param``
+    :param output_shapes: the same as input_shapes, applied to the graph outputs
+    """
+    def _override_shapes(
+            shape_list_old,  # RepeatedCompositeFieldContainer[ValueInfoProto]
+            shape_dict_new: Dict[str, Tuple[Union[str, int], ...]]):
+        for value_info in shape_list_old:
+            name = value_info.name
+            if name not in shape_dict_new:
+                continue  # entries without a requested shape are left untouched
+            dims = value_info.type.tensor_type.shape.dim
+            assert len(shape_dict_new[name]) == len(dims), \
+                f'Number of given and existing dimensions mismatch: {name}'
+            for i, new_dim in enumerate(shape_dict_new[name]):
+                if isinstance(new_dim, int):
+                    dims[i].dim_param, dims[i].dim_value = '', new_dim
+                else:
+                    dims[i].dim_value, dims[i].dim_param = 0, new_dim
+
+    if input_shapes is not None:
+        _override_shapes(model.graph.input, input_shapes)
+    if output_shapes is not None:
+        _override_shapes(model.graph.output, output_shapes)
+
+
+@click.command(help='Export a trained model to ONNX format')
+@click.option('--model', required=True, metavar='CKPT_PATH', help='Path to the model checkpoint (*.ckpt)')
+@click.option('--out', required=False, metavar='ONNX_PATH', help='Path to the output model (*.onnx)')
+def export(model, out):
+    """
+    Export a checkpoint to ONNX: trace the deployment module, override the output shapes and simplify.
+    :param model: path to the checkpoint; ``config.yaml`` is read from the same directory
+    :param out: output path; defaults to the checkpoint path with an ``.onnx`` suffix
+    """
+    model_path = pathlib.Path(model)
+    with open(model_path.with_name('config.yaml'), 'r', encoding='utf8') as f:
+        config = yaml.safe_load(f)
+    print_config(config)
+    module_cls = deployment.task_module_mapping[config['task_cls']]
+
+    # Resolve the dotted class path ('package.ClassName') into the class object.
+    pkg = ".".join(module_cls.split(".")[:-1])
+    cls_name = module_cls.split(".")[-1]
+    module_cls = getattr(importlib.import_module(pkg), cls_name)
+    assert issubclass(module_cls, deployment.BaseONNXModule), \
+        f'Module class {module_cls} is not a subclass of {deployment.BaseONNXModule}.'
+    module_ins = module_cls(config=config, model_path=model_path)
+
+    # Random dummy audio used as the example input; its length is exported as a dynamic axis.
+    waveform = torch.randn((1, 114514), dtype=torch.float32, device=module_ins.device)
+    out_path = pathlib.Path(out) if out is not None else model_path.with_suffix('.onnx')
+    torch.onnx.export(
+        module_ins,
+        waveform,
+        out_path,
+        input_names=['waveform'],
+        output_names=[
+            'note_midi',
+            'note_rest',
+            'note_dur'
+        ],
+        dynamic_axes={
+            'waveform': {1: 'n_samples'},
+            'note_midi': {1: 'n_notes'},
+            'note_rest': {1: 'n_notes'},
+            'note_dur': {1: 'n_notes'},
+        },
+        opset_version=17
+    )
+    onnx_model = onnx.load(out_path.as_posix())
+    # Give all three outputs the same explicit (1, 'n_notes') shape in the graph metadata.
+    onnx_override_io_shapes(onnx_model, output_shapes={
+        'note_midi': (1, 'n_notes'),
+        'note_rest': (1, 'n_notes'),
+        'note_dur': (1, 'n_notes'),
+    })
+    print('Running ONNX Simplifier...')
+    onnx_model, check = onnxsim.simplify(
+        onnx_model,
+        include_subgraph=True
+    )
+    assert check, 'Simplified ONNX model could not be validated'
+    onnx.save(onnx_model, out_path)
+
+
+if __name__ == '__main__':  # run the click command when executed as a script
+    export()
diff --git a/utils/infer_utils.py b/utils/infer_utils.py
@@ -7,7 +7,7 @@
 
 
 def decode_gaussian_blurred_probs(probs, vmin, vmax, deviation, threshold):
-    num_bins = probs.shape[-1]
+    num_bins = int(probs.shape[-1])
     interval = (vmax - vmin) / (num_bins - 1)
     width = int(3 * deviation / interval)  # 3 * sigma
     idx = torch.arange(num_bins, device=probs.device)[None, None, :]  # [1, 1, N]
@@ -24,14 +24,17 @@ def decode_gaussian_blurred_probs(probs, vmin, vmax, deviation, threshold):
     return values, rest
 
 
-def decode_bounds_to_alignment(bounds):
+def decode_bounds_to_alignment(bounds, use_diff=True):
     bounds_step = bounds.cumsum(dim=1).round().long()
-    bounds_inc = torch.diff(
-        bounds_step, dim=1, prepend=torch.full(
-            (bounds.shape[0], 1), fill_value=-1,
-            dtype=bounds_step.dtype, device=bounds_step.device
-        )
-    ) > 0
+    if use_diff:
+        bounds_inc = torch.diff(
+            bounds_step, dim=1, prepend=torch.full(
+                (bounds.shape[0], 1), fill_value=-1,
+                dtype=bounds_step.dtype, device=bounds_step.device
+            )
+        ) > 0
+    else:
+        bounds_inc = F.pad((bounds_step[:, 1:] > bounds_step[:, :-1]), [1, 0], value=True)
     frame2item = bounds_inc.long().cumsum(dim=1)
     return frame2item
 
@@ -48,25 +51,25 @@ def decode_note_sequence(frame2item, values, masks, threshold=0.5):
     b = frame2item.shape[0]
     space = frame2item.max() + 1
 
-    item_dur = frame2item.new_zeros(b, space).scatter_add(
+    item_dur = frame2item.new_zeros(b, space, dtype=frame2item.dtype).scatter_add(
         1, frame2item, torch.ones_like(frame2item)
     )[:, 1:]
-    item_unmasked_dur = frame2item.new_zeros(b, space).scatter_add(
+    item_unmasked_dur = frame2item.new_zeros(b, space, dtype=frame2item.dtype).scatter_add(
         1, frame2item, masks.long()
     )[:, 1:]
     item_masks = item_unmasked_dur / item_dur >= threshold
 
     values_quant = values.round().long()
-    histogram = frame2item.new_zeros(b, space * 128).scatter_add(
+    histogram = frame2item.new_zeros(b, space * 128, dtype=frame2item.dtype).scatter_add(
         1, frame2item * 128 + values_quant, torch.ones_like(frame2item) * masks
     ).unflatten(1, [space, 128])[:, 1:, :]
-    item_values_center = histogram.argmax(dim=2).to(dtype=values.dtype)
+    item_values_center = histogram.float().argmax(dim=2).to(dtype=values.dtype)
     values_center = torch.gather(F.pad(item_values_center, [1, 0]), 1, frame2item)
     values_near_center = masks & (values >= values_center - 0.5) & (values <= values_center + 0.5)
-    item_valid_dur = frame2item.new_zeros(b, space).scatter_add(
+    item_valid_dur = frame2item.new_zeros(b, space, dtype=frame2item.dtype).scatter_add(
         1, frame2item, values_near_center.long()
     )[:, 1:]
-    item_values = values.new_zeros(b, space).scatter_add(
+    item_values = values.new_zeros(b, space, dtype=values.dtype).scatter_add(
         1, frame2item, values * values_near_center
     )[:, 1:] / (item_valid_dur + (item_valid_dur == 0))