From ebc7792001874f9493634643fe6e628ce818711c Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev
Date: Thu, 24 Sep 2020 15:08:21 +0000
Subject: [PATCH] Intel OpenVINO backend

---
 README.md                 |  17 +++-
 bonito/cli/basecaller.py  |   2 +
 bonito/cli/evaluate.py    |   3 +-
 bonito/crf/basecall.py    |   5 +-
 bonito/crf/model.py       |  67 ++++++++++++++-
 bonito/ctc/basecall.py    |   3 +-
 bonito/openvino/loader.py |  31 +++++++
 bonito/openvino/model.py  | 166 ++++++++++++++++++++++++++++++++++++++
 bonito/util.py            |  15 ++--
 setup.py                  |   3 +
 10 files changed, 296 insertions(+), 16 deletions(-)
 create mode 100644 bonito/openvino/loader.py
 create mode 100644 bonito/openvino/model.py

diff --git a/README.md b/README.md
index 77a9c273..87aa26f9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Bonito
 
-[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito) 
+[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito)
 [![py36](https://img.shields.io/badge/python-3.6-brightgreen.svg)](https://img.shields.io/badge/python-3.6-brightgreen.svg)
 [![py37](https://img.shields.io/badge/python-3.7-brightgreen.svg)](https://img.shields.io/badge/python-3.7-brightgreen.svg)
 [![py38](https://img.shields.io/badge/python-3.8-brightgreen.svg)](https://img.shields.io/badge/python-3.8-brightgreen.svg)
@@ -35,6 +35,12 @@ The default `ont-bonito` package is built against CUDA 10.2 however CUDA 11.1 an
 $ pip install -f https://download.pytorch.org/whl/torch_stable.html ont-bonito-cuda111
 ```
 
+To optimize inference on CPU with Intel OpenVINO, use the `--use_openvino` flag:
+
+```bash
+$ bonito basecaller dna_r9.4.1 --reference reference.mmi --use_openvino --device=cpu /data/reads > basecalls.sam
+```
+
 ## Modified Bases
 
 Modified base calling is handled by [Remora](https://github.com/nanoporetech/remora).
@@ -54,7 +60,7 @@ $ bonito basecaller dna_r9.4.1 --save-ctc --reference reference.mmi /data/reads
 $ bonito train --directory /data/training/ctc-data /data/training/model-dir
 ```
 
-In addition to training a new model from scratch you can also easily fine tune one of the pretrained models. 
+In addition to training a new model from scratch, you can also easily fine-tune one of the pretrained models.
 
 ```bash
 bonito train --epochs 1 --lr 5e-4 --pretrained dna_r10.4_e8.1_sup@v3.4 --directory /data/training/ctc-data /data/training/fine-tuned-model
@@ -67,7 +73,7 @@ $ bonito download --training
 $ bonito train /data/training/model-dir
 ```
 
-All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True. 
+All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True.
 
 ## Developer Quickstart
 
@@ -81,6 +87,11 @@ $ source venv3/bin/activate
 (venv3) $ python setup.py develop
 ```
 
+To build with the OpenVINO backend:
+```bash
+(venv3) $ pip install -e .[openvino]
+```
+
 ## Interface
 
 - `bonito view` - view a model architecture for a given `.toml` file and the number of parameters in the network.
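The new flag plumbs through to `load_model` in `bonito/util.py` (patched below). For programmatic use, here is a minimal sketch of the equivalent Python call; the model directory path and chunk shape are placeholders, not from the patch:

```python
# Sketch (not part of the patch): the --use_openvino CLI flag maps onto the
# new use_openvino keyword of bonito.util.load_model.
import torch
from bonito.util import load_model

model = load_model(
    "/data/models/dna_r9.4.1",  # placeholder: directory with config.toml and weights_*.tar
    device="cpu",               # the wrapper upper-cases this into an IE device name ("CPU")
    use_openvino=True,          # wraps the network in an OpenVINO*Model (bonito/openvino/model.py)
)
chunks = torch.randn(4, 1, 4000)  # placeholder batch of signal chunks
scores = model(chunks)
```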
diff --git a/bonito/cli/basecaller.py b/bonito/cli/basecaller.py
index d2e7bdf4..58833649 100644
--- a/bonito/cli/basecaller.py
+++ b/bonito/cli/basecaller.py
@@ -40,6 +40,7 @@ def main(args):
             batchsize=args.batchsize,
             quantize=args.quantize,
             use_koi=True,
+            use_openvino=args.use_openvino,
         )
     except FileNotFoundError:
         sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
@@ -172,4 +173,5 @@ def argparser():
     parser.add_argument("--batchsize", default=None, type=int)
     parser.add_argument("--max-reads", default=0, type=int)
     parser.add_argument('-v', '--verbose', action='count', default=0)
+    parser.add_argument("--use_openvino", action="store_true", default=False)
     return parser
diff --git a/bonito/cli/evaluate.py b/bonito/cli/evaluate.py
index c81775c3..4d568793 100644
--- a/bonito/cli/evaluate.py
+++ b/bonito/cli/evaluate.py
@@ -45,7 +45,7 @@ def main(args):
         seqs = []
 
         print("* loading model", w)
-        model = load_model(args.model_directory, args.device, weights=w)
+        model = load_model(args.model_directory, args.device, weights=w, use_openvino=args.use_openvino)
 
         print("* calling")
         t0 = time.perf_counter()
@@ -109,4 +109,5 @@ def argparser():
     parser.add_argument("--beamsize", default=5, type=int)
     parser.add_argument("--poa", action="store_true", default=False)
     parser.add_argument("--min-coverage", default=0.5, type=float)
+    parser.add_argument("--use_openvino", action="store_true", default=False)
    return parser
diff --git a/bonito/crf/basecall.py b/bonito/crf/basecall.py
index 2d037f4a..7abae212 100644
--- a/bonito/crf/basecall.py
+++ b/bonito/crf/basecall.py
@@ -28,12 +28,13 @@ def compute_scores(model, batch, beam_width=32, beam_cut=100.0, scale=1.0, offse
     """
     with torch.inference_mode():
         device = next(model.parameters()).device
-        dtype = torch.float16 if half_supported() else torch.float32
+        dtype = torch.float16 if device != torch.device('cpu') and half_supported() else torch.float32
         scores = model(batch.to(dtype).to(device))
         if reverse:
             scores = model.seqdist.reverse_complement(scores)
+        # beam_search expects scores in FP16 precision
         sequence, qstring, moves = beam_search(
-            scores, beam_width=beam_width, beam_cut=beam_cut,
+            scores.to(torch.float16), beam_width=beam_width, beam_cut=beam_cut,
             scale=scale, offset=offset, blank_score=blank_score
         )
         return {
diff --git a/bonito/crf/model.py b/bonito/crf/model.py
index 615dd432..10359600 100644
--- a/bonito/crf/model.py
+++ b/bonito/crf/model.py
@@ -6,8 +6,9 @@
 import numpy as np
 
 from bonito.nn import Module, Convolution, LinearCRFEncoder, Serial, Permute, layers, from_dict
 
-import seqdist.sparse
-from seqdist.ctc_simple import logZ_cupy, viterbi_alignments
+if torch.cuda.is_available():
+    import seqdist.sparse
+    from seqdist.ctc_simple import logZ_cupy, viterbi_alignments
 from seqdist.core import SequenceDist, Max, Log, semiring
 
@@ -21,6 +22,58 @@ def get_stride(m):
     return 1
 
 
+def logZ_fwd_cpu(Ms, idx, v0, vT, S):
+    T, N, C, NZ = Ms.shape
+    Ms_grad = torch.zeros(T, N, C, NZ)
+
+    a = v0
+    for t in range(T):
+        s = S.mul(a[:, idx], Ms[t])
+        a = S.sum(s, -1)
+        Ms_grad[t] = s
+    return S.sum(a + vT, dim=1), Ms_grad
+
+
+def logZ_bwd_cpu(Ms, idx, vT, S, K=1):
+    assert K == 1
+    T, N, C, NZ = Ms.shape
+    Ms = Ms.reshape(T, N, -1)
+    idx_T = idx.flatten().argsort().to(dtype=torch.long).reshape(C, NZ)
+
+    betas = torch.ones(T + 1, N, C)
+
+    a = vT
+    betas[T] = a
+    for t in reversed(range(T)):
+        s = S.mul(a[:, idx_T // NZ], Ms[t, :, idx_T])
+        a = S.sum(s, -1)
+        betas[t] = a
+    return betas
+
+
+class _LogZ(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, Ms, idx, v0, vT, S:semiring):
+        idx = idx.to(dtype=torch.long, device=Ms.device)
+        logZ, Ms_grad = logZ_fwd_cpu(Ms, idx, v0, vT, S)
+        ctx.save_for_backward(Ms_grad, Ms, idx, vT)
+        ctx.semiring = S
+        return logZ
+
+    @staticmethod
+    def backward(ctx, grad):
+        Ms_grad, Ms, idx, vT = ctx.saved_tensors
+        S = ctx.semiring
+        T, N, C, NZ = Ms.shape
+        betas = logZ_bwd_cpu(Ms, idx, vT, S)
+        Ms_grad = S.mul(Ms_grad, betas[1:, :, :, None])
+        Ms_grad = S.dsum(Ms_grad.reshape(T, N, -1), dim=2).reshape(T, N, C, NZ)
+        # one gradient per forward input (Ms, idx, v0, vT, S)
+        return grad[None, :, None, None] * Ms_grad, None, None, None, None
+
+
+def sparse_logZ(Ms, idx, v0, vT, S:semiring=Log):
+    return _LogZ.apply(Ms, idx, v0, vT, S)
+
+
 class CTC_CRF(SequenceDist):
 
     def __init__(self, state_len, alphabet):
@@ -43,7 +96,10 @@ def logZ(self, scores, S:semiring=Log):
         Ms = scores.reshape(T, N, -1, len(self.alphabet))
         alpha_0 = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
         beta_T = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
-        return seqdist.sparse.logZ(Ms, self.idx, alpha_0, beta_T, S)
+        if Ms.device.type == 'cuda':
+            return seqdist.sparse.logZ(Ms, self.idx, alpha_0, beta_T, S)
+        else:
+            return sparse_logZ(Ms, self.idx, alpha_0, beta_T, S)
 
     def normalise(self, scores):
         return (scores - self.logZ(scores)[:, None] / len(scores))
@@ -58,7 +114,10 @@ def backward_scores(self, scores, S: semiring=Log):
         T, N, _ = scores.shape
         Ms = scores.reshape(T, N, -1, self.n_base + 1)
         beta_T = Ms.new_full((N, self.n_base**(self.state_len)), S.one)
-        return seqdist.sparse.bwd_scores_cupy(Ms, self.idx, beta_T, S, K=1)
+        if Ms.device.type == 'cuda':
+            return seqdist.sparse.bwd_scores_cupy(Ms, self.idx, beta_T, S, K=1)
+        else:
+            return logZ_bwd_cpu(Ms, self.idx, beta_T, S, K=1)
 
     def compute_transition_probs(self, scores, betas):
         T, N, C = scores.shape
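Since `CTC_CRF.logZ` now dispatches on device type, the CPU fall-back can be exercised directly; a toy example under the default `Log` semiring, with shapes invented for illustration and constructor arguments as declared in `bonito/crf/model.py`:

```python
# Toy exercise of the CPU logZ path (illustrative, not part of the patch).
# On a CPU tensor, CTC_CRF.logZ dispatches to sparse_logZ / logZ_fwd_cpu above.
import torch
from bonito.crf.model import CTC_CRF

seqdist = CTC_CRF(state_len=3, alphabet=['N', 'A', 'C', 'G', 'T'])
T, N = 16, 2  # timesteps, batch size
C = len(seqdist.alphabet) * seqdist.n_base ** seqdist.state_len  # 5 * 4**3 = 320
scores = torch.randn(T, N, C)

logZ = seqdist.logZ(scores)  # CPU tensor -> sparse_logZ / logZ_fwd_cpu
assert logZ.shape == (N,)
```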
diff --git a/bonito/ctc/basecall.py b/bonito/ctc/basecall.py
index 9b6f780a..92692156 100644
--- a/bonito/ctc/basecall.py
+++ b/bonito/ctc/basecall.py
@@ -35,7 +35,8 @@ def compute_scores(model, batch):
     """
     with torch.no_grad():
         device = next(model.parameters()).device
-        chunks = batch.to(torch.half).to(device)
+        chunks = batch.to(torch.half) if device != torch.device('cpu') and half_supported() else batch
+        chunks = chunks.to(device)
         probs = permute(model(chunks), 'TNC', 'NTC')
 
     return probs.cpu().to(torch.float32)
diff --git a/bonito/openvino/loader.py b/bonito/openvino/loader.py
new file mode 100644
index 00000000..e2ab6bbd
--- /dev/null
+++ b/bonito/openvino/loader.py
@@ -0,0 +1,31 @@
+import torch.nn as nn
+
+
+def convert_to_2d(model):
+    for name, l in model.named_children():
+        layer_type = l.__class__.__name__
+        if layer_type == 'Conv1d':
+            new_layer = nn.Conv2d(l.in_channels, l.out_channels,
+                                  (1, l.kernel_size[0]), (1, l.stride[0]),
+                                  (0, l.padding[0]), (1, l.dilation[0]),
+                                  l.groups, l.bias is not None, l.padding_mode)
+            params = l.state_dict()
+            params['weight'] = params['weight'].unsqueeze(2)
+            new_layer.load_state_dict(params)
+            setattr(model, name, new_layer)
+        elif layer_type == 'BatchNorm1d':
+            new_layer = nn.BatchNorm2d(l.num_features, l.eps)
+            new_layer.load_state_dict(l.state_dict())
+            new_layer.eval()
+            setattr(model, name, new_layer)
+        elif layer_type == 'Permute':
+            dims_2d = []
+            # 1D to 2D, i.e. (2, 0, 1) -> (2, 3, 0, 1)
+            for d in l.dims:
+                assert d <= 2
+                dims_2d.append(d)
+                if d == 2:
+                    dims_2d.append(3)
+            l.dims = dims_2d
+        else:
+            convert_to_2d(l)
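`convert_to_2d` rewrites the network in place, so it can be sanity-checked against a copy of the original; a toy parity check (the module is invented for illustration, not from the patch):

```python
# Toy parity check for convert_to_2d (illustrative, not part of the patch):
# after conversion, the 2D network fed an input with a singleton height
# dimension should match the original 1D network.
import copy
import torch
import torch.nn as nn
from bonito.openvino.loader import convert_to_2d

net1d = nn.Sequential(nn.Conv1d(1, 4, 5, padding=2), nn.BatchNorm1d(4)).eval()
net2d = copy.deepcopy(net1d)
convert_to_2d(net2d)

x = torch.randn(2, 1, 100)  # N, C, W
with torch.no_grad():
    assert torch.allclose(net1d(x), net2d(x.unsqueeze(2)).squeeze(2), atol=1e-6)
```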
diff --git a/bonito/openvino/model.py b/bonito/openvino/model.py
new file mode 100644
index 00000000..b2fa0f7c
--- /dev/null
+++ b/bonito/openvino/model.py
@@ -0,0 +1,166 @@
+import os
+import io
+import numpy as np
+import torch
+
+try:
+    from openvino.inference_engine import IECore, StatusCode
+    from .loader import convert_to_2d
+except ImportError:
+    pass
+
+
+def load_openvino_model(model, dirname):
+    package = model.config['model']['package']
+    if package == 'bonito.ctc':
+        return OpenVINOCTCModel(model, dirname)
+    elif package == 'bonito.crf':
+        return OpenVINOCRFModel(model, dirname)
+    else:
+        raise Exception('Unknown model configuration: ' + package)
+
+
+class OpenVINOModel:
+
+    def __init__(self, model, dirname):
+        self.model = model
+        self.alphabet = model.alphabet
+        self.parameters = model.parameters
+        self.stride = model.stride
+        self.net = None
+        self.exec_net = None
+        self.dirname = dirname
+        self.ie = IECore()
+
+
+    def eval(self):
+        pass
+
+
+    def half(self):
+        return self
+
+
+    @property
+    def config(self):
+        return self.model.config
+
+
+    def to(self, device):
+        self.device = str(device).upper()
+
+
+    def init_model(self, model, inp_shape):
+        """
+        Call this method once to initialize the executable network
+        """
+        # First, check for an IR on disk. If there is none, convert the model at runtime
+        xml_path, bin_path = [os.path.join(self.dirname, 'model') + ext for ext in ['.xml', '.bin']]
+        if os.path.exists(xml_path) and os.path.exists(bin_path):
+            self.net = self.ie.read_network(xml_path, bin_path)
+        else:
+            # Convert model to in-memory ONNX buffer
+            buf = io.BytesIO()
+            inp = torch.randn(inp_shape)
+            torch.onnx.export(model, inp, buf, input_names=['input'], output_names=['output'],
+                              opset_version=11)
+
+            # Import network from memory buffer
+            self.net = self.ie.read_network(buf.getvalue(), b'', init_from_buffer=True)
+
+        # Load model to device
+        config = {}
+        if self.device == 'CPU':
+            config = {'CPU_THROUGHPUT_STREAMS': 'CPU_THROUGHPUT_AUTO'}
+        self.exec_net = self.ie.load_network(self.net, self.device,
+                                             config=config, num_requests=0)
+
+
+    def process(self, data):
+        data = data.float()
+        batch_size = data.shape[0]
+        inp_shape = list(data.shape)
+        inp_shape[0] = 1  # We run the batch asynchronously, one chunk per request
+
+        # List that maps infer requests to the index of the chunk being processed.
+        # -1 means that the request has not been started yet.
+        infer_request_input_id = [-1] * len(self.exec_net.requests)
+        out_shape = self.net.outputs['output'].shape
+        # CTC network produces 1xWxNxC
+        output = np.zeros([out_shape[-3], batch_size, out_shape[-1]], dtype=np.float32)
+
+        for inp_id in range(batch_size):
+            # Get an idle infer request
+            infer_request_id = self.exec_net.get_idle_request_id()
+            if infer_request_id < 0:
+                status = self.exec_net.wait(num_requests=1)
+                if status != StatusCode.OK:
+                    raise Exception("Wait for idle request failed!")
+                infer_request_id = self.exec_net.get_idle_request_id()
+                if infer_request_id < 0:
+                    raise Exception("Invalid request id!")
+
+            out_id = infer_request_input_id[infer_request_id]
+            request = self.exec_net.requests[infer_request_id]
+
+            # Copy the prediction of the chunk this request processed previously
+            if out_id != -1:
+                output[:, out_id:out_id + 1] = request.output_blobs['output'].buffer
+
+            # Start this request on new data
+            infer_request_input_id[infer_request_id] = inp_id
+            request.async_infer({'input': data[inp_id]})
+
+        # Wait for the rest of the requests
+        status = self.exec_net.wait()
+        if status != StatusCode.OK:
+            raise Exception("Wait for idle request failed!")
+        for infer_request_id, out_id in enumerate(infer_request_input_id):
+            if out_id == -1:
+                continue
+            request = self.exec_net.requests[infer_request_id]
+            output[:, out_id:out_id + 1] = request.output_blobs['output'].buffer
+
+        return torch.tensor(output)
+
+
+class OpenVINOCTCModel(OpenVINOModel):
+
+    def __init__(self, model, dirname):
+        super().__init__(model, dirname)
+
+
+    def __call__(self, data):
+        data = data.unsqueeze(2)  # 1D->2D
+        if self.exec_net is None:
+            convert_to_2d(self.model)
+            self.init_model(self.model, [1, 1, 1, data.shape[-1]])
+
+        return self.process(data)
+
+
+    def decode(self, x, beamsize=5, threshold=1e-3, qscores=False, return_path=False):
+        return self.model.decode(x, beamsize=beamsize, threshold=threshold,
+                                 qscores=qscores, return_path=return_path)
+
+
+class OpenVINOCRFModel(OpenVINOModel):
+
+    def __init__(self, model, dirname):
+        super().__init__(model, dirname)
+        self.seqdist = model.seqdist
+
+
+    def __call__(self, data):
+        if self.exec_net is None:
+            self.init_model(self.model.encoder, [1, 1, data.shape[-1]])
+
+        return self.process(data)
+
+
+    def decode(self, x):
+        return self.model.decode(x)
+
+
+    def decode_batch(self, x):
+        return self.model.decode_batch(x)
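`init_model` probes for `model.xml`/`model.bin` in the model directory before falling back to the in-memory ONNX export, so the IR can be cached ahead of time; a sketch, assuming the 2021.x Inference Engine Python API (as pinned in `setup.py` below) and placeholder paths:

```python
# Optional one-time IR caching (illustrative, not part of the patch): a network
# read once (here from a placeholder ONNX file) can be serialized to the
# model.xml/model.bin pair that init_model looks for on startup.
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network('model.onnx')      # placeholder: previously exported model
net.serialize('model.xml', 'model.bin')  # paths probed by init_model
```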
diff --git a/bonito/util.py b/bonito/util.py
index dc971665..b1fed7a2 100644
--- a/bonito/util.py
+++ b/bonito/util.py
@@ -19,6 +19,7 @@
 import parasail
 import numpy as np
 from torch.cuda import get_device_capability
+from bonito.openvino.model import load_openvino_model
 
 try:
     from claragenomics.bindings import cuda
@@ -46,7 +47,7 @@ def init(seed, device):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    if device == "cpu": return
+    if not device.startswith('cuda'): return
     torch.backends.cudnn.enabled = True
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
@@ -251,7 +252,7 @@ def match_names(state_dict, model):
     return OrderedDict([(k, remap[k]) for k in state_dict.keys()])
 
 
-def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsize=None, overlap=None, quantize=False, use_koi=False):
+def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsize=None, overlap=None, quantize=False, use_koi=False, use_openvino=False):
     """
     Load a model from disk
     """
@@ -264,7 +265,8 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
             raise FileNotFoundError("no model weights found in '%s'" % dirname)
         weights = max([int(re.sub(".*_([0-9]+).tar", "\\1", w)) for w in weight_files])
 
-    device = torch.device(device)
+    if not use_openvino:
+        device = torch.device(device)
     config = toml.load(os.path.join(dirname, 'config.toml'))
 
     weights = os.path.join(dirname, 'weights_%s.tar' % weights)
@@ -285,7 +287,7 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
             model.encoder, batchsize=batchsize, chunksize=chunksize // model.stride, quantize=quantize
         )
 
-    state_dict = torch.load(weights, map_location=device)
+    state_dict = torch.load(weights, map_location=device if not use_openvino else 'cpu')
     state_dict = {k2: state_dict[k1] for k1, k2 in match_names(state_dict, model).items()}
     new_state_dict = OrderedDict()
     for k, v in state_dict.items():
@@ -294,7 +296,10 @@ def load_model(dirname, device, weights=None, half=None, chunksize=None, batchsi
 
     model.load_state_dict(new_state_dict)
 
-    if half is None:
+    if use_openvino:
+        model = load_openvino_model(model, dirname)
+
+    if half is None and device != torch.device('cpu'):
         half = half_supported()
     if half: model = model.half()
 
diff --git a/setup.py b/setup.py
index 271fabd6..71304378 100644
--- a/setup.py
+++ b/setup.py
@@ -41,6 +41,9 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=requirements,
+    extras_require={
+        'openvino': ['openvino==2021.4.2'],
+    },
    long_description=long_description,
     long_description_content_type='text/markdown',
     author='Oxford Nanopore Technologies, Ltd',
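Finally, a rough smoke test for the backend as a whole (illustrative, not part of the patch; the model path, chunk shape, and tolerance are placeholders): the OpenVINO-wrapped model should track the PyTorch CPU model on the same batch.

```python
# Illustrative end-to-end smoke test, not part of the patch.
import torch
from bonito.util import load_model

model_dir = '/data/models/dna_r9.4.1'  # placeholder model directory
chunks = torch.randn(4, 1, 4000)       # placeholder batch of signal chunks

ref = load_model(model_dir, device='cpu')
ov = load_model(model_dir, device='cpu', use_openvino=True)

with torch.no_grad():
    assert torch.allclose(ref(chunks), ov(chunks), atol=1e-3)
```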