From ebc7792001874f9493634643fe6e628ce818711c Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev
Date: Thu, 24 Sep 2020 15:08:21 +0000
Subject: [PATCH] Intel OpenVINO backend

---
 README.md                 |  17 +++-
 bonito/cli/basecaller.py  |   2 +
 bonito/cli/evaluate.py    |   3 +-
 bonito/crf/basecall.py    |   5 +-
 bonito/crf/model.py       |  67 ++++++++++++++-
 bonito/ctc/basecall.py    |   3 +-
 bonito/openvino/loader.py |  31 +++++++
 bonito/openvino/model.py  | 166 ++++++++++++++++++++++++++++++++++++++
 bonito/util.py            |  15 ++--
 setup.py                  |   3 +
 10 files changed, 296 insertions(+), 16 deletions(-)
 create mode 100644 bonito/openvino/loader.py
 create mode 100644 bonito/openvino/model.py

diff --git a/README.md b/README.md
index 77a9c273..87aa26f9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Bonito
 
-[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito) 
+[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito)
 [![py36](https://img.shields.io/badge/python-3.6-brightgreen.svg)](https://img.shields.io/badge/python-3.6-brightgreen.svg)
 [![py37](https://img.shields.io/badge/python-3.7-brightgreen.svg)](https://img.shields.io/badge/python-3.7-brightgreen.svg)
 [![py38](https://img.shields.io/badge/python-3.8-brightgreen.svg)](https://img.shields.io/badge/python-3.8-brightgreen.svg)
@@ -35,6 +35,12 @@ The default `ont-bonito` package is built against CUDA 10.2 however CUDA 11.1 an
 $ pip install -f https://download.pytorch.org/whl/torch_stable.html ont-bonito-cuda111
 ```
 
+To optimize inference on CPU with Intel OpenVINO, use the `--use_openvino` flag:
+
+```bash
+$ bonito basecaller dna_r9.4.1 --reference reference.mmi --use_openvino --device=cpu /data/reads > basecalls.sam
+```
+
 ## Modified Bases
 
 Modified base calling is handled by [Remora](https://github.com/nanoporetech/remora).
@@ -54,7 +60,7 @@ $ bonito basecaller dna_r9.4.1 --save-ctc --reference reference.mmi /data/reads
 $ bonito train --directory /data/training/ctc-data /data/training/model-dir
 ```
 
-In addition to training a new model from scratch you can also easily fine tune one of the pretrained models. 
+In addition to training a new model from scratch, you can also easily fine-tune one of the pretrained models.
 
 ```bash
 bonito train --epochs 1 --lr 5e-4 --pretrained dna_r10.4_e8.1_sup@v3.4 --directory /data/training/ctc-data /data/training/fine-tuned-model
@@ -67,7 +73,7 @@ $ bonito download --training
 $ bonito train /data/training/model-dir
 ```
 
-All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True. 
+All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True.
 
 ## Developer Quickstart
 
@@ -81,6 +87,11 @@ $ source venv3/bin/activate
 (venv3) $ python setup.py develop
 ```
 
+To build with the OpenVINO backend:
+```bash
+(venv3) $ pip install -e .[openvino]
+```
+
 ## Interface
 
 - `bonito view` - view a model architecture for a given `.toml` file and the number of parameters in the network.
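The new flag plumbs through to `load_model` in `bonito/util.py` (patched below). For programmatic use, here is a minimal sketch of the equivalent Python call; the model directory path and chunk shape are placeholders, not from the patch:

```python
# Sketch (not part of the patch): the --use_openvino CLI flag maps onto the
# new use_openvino keyword of bonito.util.load_model.
import torch
from bonito.util import load_model

model = load_model(
    "/data/models/dna_r9.4.1",  # placeholder: directory with config.toml and weights_*.tar
    device="cpu",               # the wrapper upper-cases this into an IE device name ("CPU")
    use_openvino=True,          # wraps the network in an OpenVINO*Model (bonito/openvino/model.py)
)
chunks = torch.randn(4, 1, 4000)  # placeholder batch of signal chunks
scores = model(chunks)
```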
diff --git a/bonito/cli/basecaller.py b/bonito/cli/basecaller.py
index d2e7bdf4..58833649 100644
--- a/bonito/cli/basecaller.py
+++ b/bonito/cli/basecaller.py
@@ -40,6 +40,7 @@ def main(args):
             batchsize=args.batchsize,
             quantize=args.quantize,
             use_koi=True,
+            use_openvino=args.use_openvino,
         )
     except FileNotFoundError:
         sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
@@ -172,4 +173,5 @@ def argparser():
     parser.add_argument("--batchsize", default=None, type=int)
     parser.add_argument("--max-reads", default=0, type=int)
     parser.add_argument('-v', '--verbose', action='count', default=0)
+    parser.add_argument("--use_openvino", action="store_true", default=False)
     return parser
diff --git a/bonito/cli/evaluate.py b/bonito/cli/evaluate.py
index c81775c3..4d568793 100644
--- a/bonito/cli/evaluate.py
+++ b/bonito/cli/evaluate.py
@@ -45,7 +45,7 @@ def main(args):
         seqs = []
 
         print("* loading model", w)
-        model = load_model(args.model_directory, args.device, weights=w)
+        model = load_model(args.model_directory, args.device, weights=w, use_openvino=args.use_openvino)
 
         print("* calling")
         t0 = time.perf_counter()
@@ -109,4 +109,5 @@ def argparser():
     parser.add_argument("--beamsize", default=5, type=int)
     parser.add_argument("--poa", action="store_true", default=False)
     parser.add_argument("--min-coverage", default=0.5, type=float)
+    parser.add_argument("--use_openvino", action="store_true", default=False)
    return parser
diff --git a/bonito/crf/basecall.py b/bonito/crf/basecall.py
index 2d037f4a..7abae212 100644
--- a/bonito/crf/basecall.py
+++ b/bonito/crf/basecall.py
@@ -28,12 +28,13 @@ def compute_scores(model, batch, beam_width=32, beam_cut=100.0, scale=1.0, offse
     """
     with torch.inference_mode():
         device = next(model.parameters()).device
-        dtype = torch.float16 if half_supported() else torch.float32
+        dtype = torch.float16 if device != torch.device('cpu') and half_supported() else torch.float32
         scores = model(batch.to(dtype).to(device))
         if reverse:
             scores = model.seqdist.reverse_complement(scores)
+        # beam_search expects scores in FP16 precision
         sequence, qstring, moves = beam_search(
-            scores, beam_width=beam_width, beam_cut=beam_cut,
+            scores.to(torch.float16), beam_width=beam_width, beam_cut=beam_cut,
             scale=scale, offset=offset, blank_score=blank_score
         )
         return {
diff --git a/bonito/crf/model.py b/bonito/crf/model.py
index 615dd432..10359600 100644
--- a/bonito/crf/model.py
+++ b/bonito/crf/model.py
@@ -6,8 +6,9 @@
 import numpy as np
 
 from bonito.nn import Module, Convolution, LinearCRFEncoder, Serial, Permute, layers, from_dict
 
-import seqdist.sparse
-from seqdist.ctc_simple import logZ_cupy, viterbi_alignments
+if torch.cuda.is_available():
+    import seqdist.sparse
+    from seqdist.ctc_simple import logZ_cupy, viterbi_alignments
 from seqdist.core import SequenceDist, Max, Log, semiring
 
@@ -21,6 +22,58 @@ def get_stride(m):
     return 1
 
 
+def logZ_fwd_cpu(Ms, idx, v0, vT, S):
+    T, N, C, NZ = Ms.shape
+    Ms_grad = torch.zeros(T, N, C, NZ)
+
+    a = v0
+    for t in range(T):
+        s = S.mul(a[:, idx], Ms[t])
+        a = S.sum(s, -1)
+        Ms_grad[t] = s
+    return S.sum(a + vT, dim=1), Ms_grad
+
+
+def logZ_bwd_cpu(Ms, idx, vT, S, K=1):
+    assert K == 1
+    T, N, C, NZ = Ms.shape
+    Ms = Ms.reshape(T, N, -1)
+    idx_T = idx.flatten().argsort().to(dtype=torch.long).reshape(C, NZ)
+
+    betas = torch.ones(T + 1, N, C)
+
+    a = vT
+    betas[T] = a
+    for t in reversed(range(T)):
+        s = S.mul(a[:, idx_T // NZ], Ms[t, :, idx_T])
+        a = S.sum(s, -1)
+        betas[t] = a
+    return betas
+
+
+class _LogZ(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, Ms, idx, v0, vT, S:semiring):
+        idx = idx.to(dtype=torch.long, device=Ms.device)
+        logZ, Ms_grad = logZ_fwd_cpu(Ms, idx, v0, vT, S)
+        ctx.save_for_backward(Ms_grad, Ms, idx, vT)
+        ctx.semiring = S
+        return logZ
+
+    @staticmethod
+    def backward(ctx, grad):
+        Ms_grad, Ms, idx, vT = ctx.saved_tensors
+        S = ctx.semiring
+        T, N, C, NZ = Ms.shape
+        betas = logZ_bwd_cpu(Ms, idx, vT, S)
+        Ms_grad = S.mul(Ms_grad, betas[1:, :, :, None])
+        Ms_grad = S.dsum(Ms_grad.reshape(T, N, -1), dim=2).reshape(T, N, C, NZ)
+        # one gradient per forward input (Ms, idx, v0, vT, S)
+        return grad[None, :, None, None] * Ms_grad, None, None, None, None
+
+
+def sparse_logZ(Ms, idx, v0, vT, S:semiring=Log):
+    return _LogZ.apply(Ms, idx, v0, vT, S)
+
+
 class CTC_CRF(SequenceDist):
 
     def __init__(self, state_len, alphabet):
@@ -43,7 +96,10 @@ def logZ(self, scores, S:semiring=Log):
         Ms = scores.reshape(T, N, -1, len(self.alphabet))
         alpha_0 = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
         beta_T = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
-        return seqdist.sparse.logZ(Ms, self.idx, alpha_0, beta_T, S)
+        if Ms.device.type == 'cuda':
+            return seqdist.sparse.logZ(Ms, self.idx, alpha_0, beta_T, S)
+        else:
+            return sparse_logZ(Ms, self.idx, alpha_0, beta_T, S)
 
     def normalise(self, scores):
         return (scores - self.logZ(scores)[:, None] / len(scores))
@@ -58,7 +114,10 @@ def backward_scores(self, scores, S: semiring=Log):
         T, N, _ = scores.shape
         Ms = scores.reshape(T, N, -1, self.n_base + 1)
         beta_T = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
-        return seqdist.sparse.bwd_scores_cupy(Ms, self.idx, beta_T, S, K=1)
+        if Ms.device.type == 'cuda':
+            return seqdist.sparse.bwd_scores_cupy(Ms, self.idx, beta_T, S, K=1)
+        else:
+            return logZ_bwd_cpu(Ms, self.idx, beta_T, S, K=1)
 
     def compute_transition_probs(self, scores, betas):
         T, N, C = scores.shape
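Since `CTC_CRF.logZ` now dispatches on device type, the CPU fall-back can be exercised directly; a toy example under the default `Log` semiring, with shapes invented for illustration and constructor arguments as declared in `bonito/crf/model.py`:

```python
# Toy exercise of the CPU logZ path (illustrative, not part of the patch).
# On a CPU tensor, CTC_CRF.logZ dispatches to sparse_logZ / logZ_fwd_cpu above.
import torch
from bonito.crf.model import CTC_CRF

seqdist = CTC_CRF(state_len=3, alphabet=['N', 'A', 'C', 'G', 'T'])
T, N = 16, 2  # timesteps, batch size
C = len(seqdist.alphabet) * seqdist.n_base ** seqdist.state_len  # 5 * 4**3 = 320
scores = torch.randn(T, N, C)

logZ = seqdist.logZ(scores)  # CPU tensor -> sparse_logZ / logZ_fwd_cpu
assert logZ.shape == (N,)
```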
diff --git a/bonito/ctc/basecall.py b/bonito/ctc/basecall.py
index 9b6f780a..92692156 100644
--- a/bonito/ctc/basecall.py
+++ b/bonito/ctc/basecall.py
@@ -35,7 +35,8 @@ def compute_scores(model, batch):
     """
     with torch.no_grad():
         device = next(model.parameters()).device
-        chunks = batch.to(torch.half).to(device)
+        chunks = batch.to(torch.half) if device != torch.device('cpu') and half_supported() else batch
+        chunks = chunks.to(device)
         probs = permute(model(chunks), 'TNC', 'NTC')
 
     return probs.cpu().to(torch.float32)
diff --git a/bonito/openvino/loader.py b/bonito/openvino/loader.py
new file mode 100644
index 00000000..e2ab6bbd
--- /dev/null
+++ b/bonito/openvino/loader.py
@@ -0,0 +1,31 @@
+import torch.nn as nn
+
+
+def convert_to_2d(model):
+    for name, l in model.named_children():
+        layer_type = l.__class__.__name__
+        if layer_type == 'Conv1d':
+            new_layer = nn.Conv2d(l.in_channels, l.out_channels,
+                                  (1, l.kernel_size[0]), (1, l.stride[0]),
+                                  (0, l.padding[0]), (1, l.dilation[0]),
+                                  l.groups, l.bias is not None, l.padding_mode)
+            params = l.state_dict()
+            params['weight'] = params['weight'].unsqueeze(2)
+            new_layer.load_state_dict(params)
+            setattr(model, name, new_layer)
+        elif layer_type == 'BatchNorm1d':
+            new_layer = nn.BatchNorm2d(l.num_features, l.eps)
+            new_layer.load_state_dict(l.state_dict())
+            new_layer.eval()
+            setattr(model, name, new_layer)
+        elif layer_type == 'Permute':
+            dims_2d = []
+            # 1D to 2D, i.e. (2, 0, 1) -> (2, 3, 0, 1)
+            for d in l.dims:
+                assert d <= 2
+                dims_2d.append(d)
+                if d == 2:
+                    dims_2d.append(3)
+            l.dims = dims_2d
+        else:
+            convert_to_2d(l)
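`convert_to_2d` rewrites the network in place, so it can be sanity-checked against a copy of the original; a toy parity check (the module is invented for illustration, not from the patch):

```python
# Toy parity check for convert_to_2d (illustrative, not part of the patch):
# after conversion, the 2D network fed an input with a singleton height
# dimension should match the original 1D network.
import copy
import torch
import torch.nn as nn
from bonito.openvino.loader import convert_to_2d

net1d = nn.Sequential(nn.Conv1d(1, 4, 5, padding=2), nn.BatchNorm1d(4)).eval()
net2d = copy.deepcopy(net1d)
convert_to_2d(net2d)

x = torch.randn(2, 1, 100)  # N, C, W
with torch.no_grad():
    assert torch.allclose(net1d(x), net2d(x.unsqueeze(2)).squeeze(2), atol=1e-6)
```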
diff --git a/bonito/openvino/model.py b/bonito/openvino/model.py
new file mode 100644
index 00000000..b2fa0f7c
--- /dev/null
+++ b/bonito/openvino/model.py
@@ -0,0 +1,166 @@
+import os
+import io
+import numpy as np
+import torch
+
+try:
+    from openvino.inference_engine import IECore, StatusCode
+    from .loader import convert_to_2d
+except ImportError:
+    pass
+
+
+def load_openvino_model(model, dirname):
+    package = model.config['model']['package']
+    if package == 'bonito.ctc':
+        return OpenVINOCTCModel(model, dirname)
+    elif package == 'bonito.crf':
+        return OpenVINOCRFModel(model, dirname)
+    else:
+        raise Exception('Unknown model configuration: ' + package)
+
+
+class OpenVINOModel:
+
+    def __init__(self, model, dirname):
+        self.model = model
+        self.alphabet = model.alphabet
+        self.parameters = model.parameters
+        self.stride = model.stride
+        self.net = None
+        self.exec_net = None
+        self.dirname = dirname
+        self.ie = IECore()
+
+
+    def eval(self):
+        pass
+
+
+    def half(self):
+        return self
+
+
+    @property
+    def config(self):
+        return self.model.config
+
+
+    def to(self, device):
+        self.device = str(device).upper()
+
+
+    def init_model(self, model, inp_shape):
+        """
+        Call this method once to initialize the executable network
+        """
+        # First, check for an IR on disk. If there is none, convert the model at runtime
+        xml_path, bin_path = [os.path.join(self.dirname, 'model') + ext for ext in ['.xml', '.bin']]
+        if os.path.exists(xml_path) and os.path.exists(bin_path):
+            self.net = self.ie.read_network(xml_path, bin_path)
+        else:
+            # Convert model to in-memory ONNX buffer
+            buf = io.BytesIO()
+            inp = torch.randn(inp_shape)
+            torch.onnx.export(model, inp, buf, input_names=['input'], output_names=['output'],
+                              opset_version=11)
+
+            # Import network from memory buffer
+            self.net = self.ie.read_network(buf.getvalue(), b'', init_from_buffer=True)
+
+        # Load model to device
+        config = {}
+        if self.device == 'CPU':
+            config = {'CPU_THROUGHPUT_STREAMS': 'CPU_THROUGHPUT_AUTO'}
+        self.exec_net = self.ie.load_network(self.net, self.device,
+                                             config=config, num_requests=0)
+
+
+    def process(self, data):
+        data = data.float()
+        batch_size = data.shape[0]
+        inp_shape = list(data.shape)
+        inp_shape[0] = 1  # We run the batch asynchronously, one chunk per request
+
+        # List that maps infer requests to the index of the chunk being processed.
+        # -1 means that the request has not been started yet.
+        infer_request_input_id = [-1] * len(self.exec_net.requests)
+        out_shape = self.net.outputs['output'].shape
+        # CTC network produces 1xWxNxC
+        output = np.zeros([out_shape[-3], batch_size, out_shape[-1]], dtype=np.float32)
+
+        for inp_id in range(batch_size):
+            # Get an idle infer request
+            infer_request_id = self.exec_net.get_idle_request_id()
+            if infer_request_id < 0:
+                status = self.exec_net.wait(num_requests=1)
+                if status != StatusCode.OK:
+                    raise Exception("Wait for idle request failed!")
+                infer_request_id = self.exec_net.get_idle_request_id()
+                if infer_request_id < 0:
+                    raise Exception("Invalid request id!")
+
+            out_id = infer_request_input_id[infer_request_id]
+            request = self.exec_net.requests[infer_request_id]
+
+            # Copy the prediction of the chunk this request processed previously
+            if out_id != -1:
+                output[:, out_id:out_id + 1] = request.output_blobs['output'].buffer
+
+            # Start this request on new data
+            infer_request_input_id[infer_request_id] = inp_id
+            request.async_infer({'input': data[inp_id]})
+
+        # Wait for the rest of the requests
+        status = self.exec_net.wait()
+        if status != StatusCode.OK:
+            raise Exception("Wait for idle request failed!")
+        for infer_request_id, out_id in enumerate(infer_request_input_id):
+            if out_id == -1:
+                continue
+            request = self.exec_net.requests[infer_request_id]
+            output[:, out_id:out_id + 1] = request.output_blobs['output'].buffer
+
+        return torch.tensor(output)
+
+
+class OpenVINOCTCModel(OpenVINOModel):
+
+    def __init__(self, model, dirname):
+        super().__init__(model, dirname)
+
+
+    def __call__(self, data):
+        data = data.unsqueeze(2)  # 1D->2D
+        if self.exec_net is None:
+            convert_to_2d(self.model)
+            self.init_model(self.model, [1, 1, 1, data.shape[-1]])
+
+        return self.process(data)
+
+
+    def decode(self, x, beamsize=5, threshold=1e-3, qscores=False, return_path=False):
+        return self.model.decode(x, beamsize=beamsize, threshold=threshold,
+                                 qscores=qscores, return_path=return_path)
+
+
+class OpenVINOCRFModel(OpenVINOModel):
+
+    def __init__(self, model, dirname):
+        super().__init__(model, dirname)
+        self.seqdist = model.seqdist
+
+
+    def __call__(self, data):
+        if self.exec_net is None:
+            self.init_model(self.model.encoder, [1, 1, data.shape[-1]])
+
+        return self.process(data)
+
+
+    def decode(self, x):
+        return self.model.decode(x)
+
+
+    def decode_batch(self, x):
+        return self.model.decode_batch(x)
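`init_model` probes for `model.xml`/`model.bin` in the model directory before falling back to the in-memory ONNX export, so the IR can be cached ahead of time; a sketch, assuming the 2021.x Inference Engine Python API (as pinned in `setup.py` below) and placeholder paths:

```python
# Optional one-time IR caching (illustrative, not part of the patch): a network
# read once (here from a placeholder ONNX file) can be serialized to the
# model.xml/model.bin pair that init_model looks for on startup.
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network('model.onnx')      # placeholder: previously exported model
net.serialize('model.xml', 'model.bin')  # paths probed by init_model
```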
diff --git a/bonito/util.py b/bonito/util.py
index dc971665..b1fed7a2 100644
--- a/bonito/util.py
+++ b/bonito/util.py
@@ -19,6 +19,7 @@
 import parasail
 import numpy as np
 from torch.cuda import get_device_capability
+from bonito.openvino.model import load_openvino_model
 
 try:
     from claragenomics.bindings import cuda
@@ -46,7 +47,7 @@ def init(seed, device):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    if device == "cpu": return
+    if not device.startswith('cuda'): return
     torch.backends.cudnn.enabled = True
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
@@ -251,7 +252,7 @@ def match_names(state_dict, model):
     return OrderedDict([(k, remap[k]) for k in state_dict.keys()])
 
 
-def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsize=None, overlap=None, quantize=False, use_koi=False):
+def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsize=None, overlap=None, quantize=False, use_koi=False, use_openvino=False):
     """
     Load a model from disk
     """
@@ -264,7 +265,8 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
             raise FileNotFoundError("no model weights found in '%s'" % dirname)
         weights = max([int(re.sub(".*_([0-9]+).tar", "\\1", w)) for w in weight_files])
 
-    device = torch.device(device)
+    if not use_openvino:
+        device = torch.device(device)
     config = toml.load(os.path.join(dirname, 'config.toml'))
 
     weights = os.path.join(dirname, 'weights_%s.tar' % weights)
@@ -285,7 +287,7 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
             model.encoder, batchsize=batchsize, chunksize=chunksize // model.stride, quantize=quantize
         )
 
-    state_dict = torch.load(weights, map_location=device)
+    state_dict = torch.load(weights, map_location=device if not use_openvino else 'cpu')
     state_dict = {k2: state_dict[k1] for k1, k2 in match_names(state_dict, model).items()}
     new_state_dict = OrderedDict()
     for k, v in state_dict.items():
@@ -294,7 +296,10 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
 
     model.load_state_dict(new_state_dict)
 
-    if half is None:
+    if use_openvino:
+        model = load_openvino_model(model, dirname)
+
+    if half is None and device != torch.device('cpu'):
         half = half_supported()
     if half: model = model.half()
 
diff --git a/setup.py b/setup.py
index 271fabd6..71304378 100644
--- a/setup.py
+++ b/setup.py
@@ -41,6 +41,9 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=requirements,
+    extras_require={
+        'openvino': ['openvino==2021.4.2'],
+    },
    long_description=long_description,
     long_description_content_type='text/markdown',
     author='Oxford Nanopore Technologies, Ltd',
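Finally, a rough smoke test for the backend as a whole (illustrative, not part of the patch; the model path, chunk shape, and tolerance are placeholders): the OpenVINO-wrapped model should track the PyTorch CPU model on the same batch.

```python
# Illustrative end-to-end smoke test, not part of the patch.
import torch
from bonito.util import load_model

model_dir = '/data/models/dna_r9.4.1'  # placeholder model directory
chunks = torch.randn(4, 1, 4000)       # placeholder batch of signal chunks

ref = load_model(model_dir, device='cpu')
ov = load_model(model_dir, device='cpu', use_openvino=True)

with torch.no_grad():
    assert torch.allclose(ref(chunks), ov(chunks), atol=1e-3)
```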